From dfe442c7173173de5f9079112dda2d609fa02365 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 17 Jan 2023 19:32:55 -0600 Subject: [PATCH 001/200] add c and asm source files from blst --- crypto/blst_src/LICENSE | 201 + crypto/blst_src/aggregate.c | 673 +++ crypto/blst_src/asm/add_mod_256-armv8.pl | 412 ++ crypto/blst_src/asm/add_mod_256-x86_64.pl | 547 +++ crypto/blst_src/asm/add_mod_384-armv8.pl | 937 ++++ crypto/blst_src/asm/add_mod_384-x86_64.pl | 1500 ++++++ crypto/blst_src/asm/add_mod_384x384-x86_64.pl | 260 + crypto/blst_src/asm/arm-xlate.pl | 386 ++ .../blst_src/asm/ct_inverse_mod_256-armv8.pl | 586 +++ .../blst_src/asm/ct_inverse_mod_256-x86_64.pl | 837 ++++ .../blst_src/asm/ct_inverse_mod_384-armv8.pl | 610 +++ .../asm/ct_is_square_mod_384-armv8.pl | 401 ++ .../asm/ct_is_square_mod_384-x86_64.pl | 494 ++ .../asm/ctq_inverse_mod_384-x86_64.pl | 886 ++++ .../asm/ctx_inverse_mod_384-x86_64.pl | 995 ++++ crypto/blst_src/asm/div3w-armv8.pl | 122 + crypto/blst_src/asm/div3w-x86_64.pl | 184 + crypto/blst_src/asm/mul_mont_256-armv8.pl | 409 ++ crypto/blst_src/asm/mul_mont_384-armv8.pl | 2015 ++++++++ crypto/blst_src/asm/mulq_mont_256-x86_64.pl | 513 ++ crypto/blst_src/asm/mulq_mont_384-x86_64.pl | 2675 +++++++++++ crypto/blst_src/asm/mulx_mont_256-x86_64.pl | 486 ++ crypto/blst_src/asm/mulx_mont_384-x86_64.pl | 2384 ++++++++++ crypto/blst_src/asm/sha256-armv8.pl | 541 +++ crypto/blst_src/asm/sha256-portable-x86_64.pl | 337 ++ crypto/blst_src/asm/sha256-x86_64.pl | 789 +++ crypto/blst_src/asm/x86_64-xlate.pl | 1781 +++++++ crypto/blst_src/blst_t.hpp | 538 +++ crypto/blst_src/build/assembly.S | 123 + crypto/blst_src/build/bindings_trim.pl | 37 + .../blst_src/build/coff/add_mod_256-armv8.S | 397 ++ .../blst_src/build/coff/add_mod_256-x86_64.s | 911 ++++ .../blst_src/build/coff/add_mod_384-armv8.S | 1056 ++++ .../blst_src/build/coff/add_mod_384-x86_64.s | 2481 ++++++++++ .../build/coff/add_mod_384x384-x86_64.s | 326 ++ .../build/coff/ct_inverse_mod_256-armv8.S | 798 ++++ .../build/coff/ct_inverse_mod_256-x86_64.s | 1209 +++++ .../build/coff/ct_inverse_mod_384-armv8.S | 729 +++ .../build/coff/ct_is_square_mod_384-armv8.S | 334 ++ .../build/coff/ct_is_square_mod_384-x86_64.s | 505 ++ .../build/coff/ctq_inverse_mod_384-x86_64.s | 1221 +++++ .../build/coff/ctx_inverse_mod_384-x86_64.s | 1596 +++++++ crypto/blst_src/build/coff/div3w-armv8.S | 94 + crypto/blst_src/build/coff/div3w-x86_64.s | 140 + .../blst_src/build/coff/mul_mont_256-armv8.S | 474 ++ .../blst_src/build/coff/mul_mont_384-armv8.S | 2424 ++++++++++ .../build/coff/mulq_mont_256-x86_64.s | 872 ++++ .../build/coff/mulq_mont_384-x86_64.s | 4206 ++++++++++++++++ .../build/coff/mulx_mont_256-x86_64.s | 784 +++ .../build/coff/mulx_mont_384-x86_64.s | 3559 ++++++++++++++ crypto/blst_src/build/coff/sha256-armv8.S | 1087 +++++ .../build/coff/sha256-portable-x86_64.s | 1784 +++++++ crypto/blst_src/build/coff/sha256-x86_64.s | 1560 ++++++ crypto/blst_src/build/elf/add_mod_256-armv8.S | 379 ++ .../blst_src/build/elf/add_mod_256-x86_64.s | 572 +++ crypto/blst_src/build/elf/add_mod_384-armv8.S | 1000 ++++ .../blst_src/build/elf/add_mod_384-x86_64.s | 1907 ++++++++ .../build/elf/add_mod_384x384-x86_64.s | 252 + .../build/elf/ct_inverse_mod_256-armv8.S | 784 +++ .../build/elf/ct_inverse_mod_256-x86_64.s | 1185 +++++ .../build/elf/ct_inverse_mod_384-armv8.S | 717 +++ .../build/elf/ct_is_square_mod_384-armv8.S | 324 ++ .../build/elf/ct_is_square_mod_384-x86_64.s | 479 ++ .../build/elf/ctq_inverse_mod_384-x86_64.s | 1195 +++++ 
.../build/elf/ctx_inverse_mod_384-x86_64.s | 1574 ++++++ crypto/blst_src/build/elf/div3w-armv8.S | 88 + crypto/blst_src/build/elf/div3w-x86_64.s | 123 + .../blst_src/build/elf/mul_mont_256-armv8.S | 464 ++ .../blst_src/build/elf/mul_mont_384-armv8.S | 2372 +++++++++ .../blst_src/build/elf/mulq_mont_256-x86_64.s | 714 +++ .../blst_src/build/elf/mulq_mont_384-x86_64.s | 3620 ++++++++++++++ .../blst_src/build/elf/mulx_mont_256-x86_64.s | 627 +++ .../blst_src/build/elf/mulx_mont_384-x86_64.s | 2968 ++++++++++++ crypto/blst_src/build/elf/sha256-armv8.S | 1077 +++++ .../build/elf/sha256-portable-x86_64.s | 1754 +++++++ crypto/blst_src/build/elf/sha256-x86_64.s | 1446 ++++++ .../blst_src/build/mach-o/add_mod_256-armv8.S | 379 ++ .../build/mach-o/add_mod_256-x86_64.s | 564 +++ .../blst_src/build/mach-o/add_mod_384-armv8.S | 1000 ++++ .../build/mach-o/add_mod_384-x86_64.s | 1899 ++++++++ .../build/mach-o/add_mod_384x384-x86_64.s | 244 + .../build/mach-o/ct_inverse_mod_256-armv8.S | 784 +++ .../build/mach-o/ct_inverse_mod_256-x86_64.s | 1177 +++++ .../build/mach-o/ct_inverse_mod_384-armv8.S | 717 +++ .../build/mach-o/ct_is_square_mod_384-armv8.S | 324 ++ .../mach-o/ct_is_square_mod_384-x86_64.s | 471 ++ .../build/mach-o/ctq_inverse_mod_384-x86_64.s | 1187 +++++ .../build/mach-o/ctx_inverse_mod_384-x86_64.s | 1566 ++++++ crypto/blst_src/build/mach-o/div3w-armv8.S | 88 + crypto/blst_src/build/mach-o/div3w-x86_64.s | 115 + .../build/mach-o/mul_mont_256-armv8.S | 464 ++ .../build/mach-o/mul_mont_384-armv8.S | 2372 +++++++++ .../build/mach-o/mulq_mont_256-x86_64.s | 706 +++ .../build/mach-o/mulq_mont_384-x86_64.s | 3612 ++++++++++++++ .../build/mach-o/mulx_mont_256-x86_64.s | 619 +++ .../build/mach-o/mulx_mont_384-x86_64.s | 2960 ++++++++++++ crypto/blst_src/build/mach-o/sha256-armv8.S | 1077 +++++ .../build/mach-o/sha256-portable-x86_64.s | 1746 +++++++ crypto/blst_src/build/mach-o/sha256-x86_64.s | 1438 ++++++ crypto/blst_src/build/refresh.sh | 49 + .../build/win64/add_mod_256-armv8.asm | 380 ++ .../build/win64/add_mod_256-x86_64.asm | 934 ++++ .../build/win64/add_mod_384-armv8.asm | 1001 ++++ .../build/win64/add_mod_384-x86_64.asm | 2504 ++++++++++ .../build/win64/add_mod_384x384-x86_64.asm | 334 ++ crypto/blst_src/build/win64/blst.def | 217 + .../build/win64/ct_inverse_mod_256-armv8.asm | 785 +++ .../build/win64/ct_inverse_mod_256-x86_64.asm | 1211 +++++ .../build/win64/ct_inverse_mod_384-armv8.asm | 718 +++ .../win64/ct_is_square_mod_384-armv8.asm | 325 ++ .../win64/ct_is_square_mod_384-x86_64.asm | 509 ++ .../win64/ctq_inverse_mod_384-x86_64.asm | 1224 +++++ .../win64/ctx_inverse_mod_384-x86_64.asm | 1597 +++++++ crypto/blst_src/build/win64/div3w-armv8.asm | 89 + crypto/blst_src/build/win64/div3w-x86_64.asm | 152 + crypto/blst_src/build/win64/dll.c | 32 + .../build/win64/mul_mont_256-armv8.asm | 465 ++ .../build/win64/mul_mont_384-armv8.asm | 2373 +++++++++ .../build/win64/mulq_mont_256-x86_64.asm | 884 ++++ .../build/win64/mulq_mont_384-x86_64.asm | 4233 +++++++++++++++++ .../build/win64/mulx_mont_256-x86_64.asm | 796 ++++ .../build/win64/mulx_mont_384-x86_64.asm | 3586 ++++++++++++++ crypto/blst_src/build/win64/sha256-armv8.asm | 1078 +++++ crypto/blst_src/build/win64/sha256-x86_64.asm | 1570 ++++++ crypto/blst_src/bulk_addition.c | 168 + crypto/blst_src/bytes.h | 152 + crypto/blst_src/client_min_pk.c | 17 + crypto/blst_src/client_min_sig.c | 17 + crypto/blst_src/consts.c | 36 + crypto/blst_src/consts.h | 30 + crypto/blst_src/e1.c | 564 +++ crypto/blst_src/e2.c | 638 +++ crypto/blst_src/ec_mult.h 
| 289 ++ crypto/blst_src/ec_ops.h | 787 +++ crypto/blst_src/errors.h | 19 + crypto/blst_src/exp.c | 55 + crypto/blst_src/exports.c | 559 +++ crypto/blst_src/fields.h | 116 + crypto/blst_src/fp12_tower.c | 789 +++ crypto/blst_src/hash_to_field.c | 177 + crypto/blst_src/keygen.c | 319 ++ crypto/blst_src/map_to_g1.c | 559 +++ crypto/blst_src/map_to_g2.c | 444 ++ crypto/blst_src/multi_scalar.c | 414 ++ crypto/blst_src/no_asm.h | 1345 ++++++ crypto/blst_src/pairing.c | 444 ++ crypto/blst_src/pentaroot-addchain.h | 333 ++ crypto/blst_src/pentaroot.c | 76 + crypto/blst_src/point.h | 62 + crypto/blst_src/rb_tree.c | 145 + crypto/blst_src/recip-addchain.h | 489 ++ crypto/blst_src/recip.c | 139 + crypto/blst_src/server.c | 27 + crypto/blst_src/sha256.h | 140 + crypto/blst_src/sqrt-addchain.h | 489 ++ crypto/blst_src/sqrt.c | 261 + crypto/blst_src/vect.c | 176 + crypto/blst_src/vect.h | 418 ++ 158 files changed, 140075 insertions(+) create mode 100644 crypto/blst_src/LICENSE create mode 100644 crypto/blst_src/aggregate.c create mode 100755 crypto/blst_src/asm/add_mod_256-armv8.pl create mode 100755 crypto/blst_src/asm/add_mod_256-x86_64.pl create mode 100755 crypto/blst_src/asm/add_mod_384-armv8.pl create mode 100755 crypto/blst_src/asm/add_mod_384-x86_64.pl create mode 100755 crypto/blst_src/asm/add_mod_384x384-x86_64.pl create mode 100755 crypto/blst_src/asm/arm-xlate.pl create mode 100755 crypto/blst_src/asm/ct_inverse_mod_256-armv8.pl create mode 100755 crypto/blst_src/asm/ct_inverse_mod_256-x86_64.pl create mode 100755 crypto/blst_src/asm/ct_inverse_mod_384-armv8.pl create mode 100755 crypto/blst_src/asm/ct_is_square_mod_384-armv8.pl create mode 100755 crypto/blst_src/asm/ct_is_square_mod_384-x86_64.pl create mode 100755 crypto/blst_src/asm/ctq_inverse_mod_384-x86_64.pl create mode 100755 crypto/blst_src/asm/ctx_inverse_mod_384-x86_64.pl create mode 100755 crypto/blst_src/asm/div3w-armv8.pl create mode 100755 crypto/blst_src/asm/div3w-x86_64.pl create mode 100755 crypto/blst_src/asm/mul_mont_256-armv8.pl create mode 100755 crypto/blst_src/asm/mul_mont_384-armv8.pl create mode 100755 crypto/blst_src/asm/mulq_mont_256-x86_64.pl create mode 100755 crypto/blst_src/asm/mulq_mont_384-x86_64.pl create mode 100755 crypto/blst_src/asm/mulx_mont_256-x86_64.pl create mode 100755 crypto/blst_src/asm/mulx_mont_384-x86_64.pl create mode 100755 crypto/blst_src/asm/sha256-armv8.pl create mode 100755 crypto/blst_src/asm/sha256-portable-x86_64.pl create mode 100755 crypto/blst_src/asm/sha256-x86_64.pl create mode 100755 crypto/blst_src/asm/x86_64-xlate.pl create mode 100644 crypto/blst_src/blst_t.hpp create mode 100644 crypto/blst_src/build/assembly.S create mode 100755 crypto/blst_src/build/bindings_trim.pl create mode 100644 crypto/blst_src/build/coff/add_mod_256-armv8.S create mode 100644 crypto/blst_src/build/coff/add_mod_256-x86_64.s create mode 100644 crypto/blst_src/build/coff/add_mod_384-armv8.S create mode 100644 crypto/blst_src/build/coff/add_mod_384-x86_64.s create mode 100644 crypto/blst_src/build/coff/add_mod_384x384-x86_64.s create mode 100644 crypto/blst_src/build/coff/ct_inverse_mod_256-armv8.S create mode 100644 crypto/blst_src/build/coff/ct_inverse_mod_256-x86_64.s create mode 100644 crypto/blst_src/build/coff/ct_inverse_mod_384-armv8.S create mode 100644 crypto/blst_src/build/coff/ct_is_square_mod_384-armv8.S create mode 100644 crypto/blst_src/build/coff/ct_is_square_mod_384-x86_64.s create mode 100644 crypto/blst_src/build/coff/ctq_inverse_mod_384-x86_64.s create mode 100644 
crypto/blst_src/build/coff/ctx_inverse_mod_384-x86_64.s create mode 100644 crypto/blst_src/build/coff/div3w-armv8.S create mode 100644 crypto/blst_src/build/coff/div3w-x86_64.s create mode 100644 crypto/blst_src/build/coff/mul_mont_256-armv8.S create mode 100644 crypto/blst_src/build/coff/mul_mont_384-armv8.S create mode 100644 crypto/blst_src/build/coff/mulq_mont_256-x86_64.s create mode 100644 crypto/blst_src/build/coff/mulq_mont_384-x86_64.s create mode 100644 crypto/blst_src/build/coff/mulx_mont_256-x86_64.s create mode 100644 crypto/blst_src/build/coff/mulx_mont_384-x86_64.s create mode 100644 crypto/blst_src/build/coff/sha256-armv8.S create mode 100644 crypto/blst_src/build/coff/sha256-portable-x86_64.s create mode 100644 crypto/blst_src/build/coff/sha256-x86_64.s create mode 100644 crypto/blst_src/build/elf/add_mod_256-armv8.S create mode 100644 crypto/blst_src/build/elf/add_mod_256-x86_64.s create mode 100644 crypto/blst_src/build/elf/add_mod_384-armv8.S create mode 100644 crypto/blst_src/build/elf/add_mod_384-x86_64.s create mode 100644 crypto/blst_src/build/elf/add_mod_384x384-x86_64.s create mode 100644 crypto/blst_src/build/elf/ct_inverse_mod_256-armv8.S create mode 100644 crypto/blst_src/build/elf/ct_inverse_mod_256-x86_64.s create mode 100644 crypto/blst_src/build/elf/ct_inverse_mod_384-armv8.S create mode 100644 crypto/blst_src/build/elf/ct_is_square_mod_384-armv8.S create mode 100644 crypto/blst_src/build/elf/ct_is_square_mod_384-x86_64.s create mode 100644 crypto/blst_src/build/elf/ctq_inverse_mod_384-x86_64.s create mode 100644 crypto/blst_src/build/elf/ctx_inverse_mod_384-x86_64.s create mode 100644 crypto/blst_src/build/elf/div3w-armv8.S create mode 100644 crypto/blst_src/build/elf/div3w-x86_64.s create mode 100644 crypto/blst_src/build/elf/mul_mont_256-armv8.S create mode 100644 crypto/blst_src/build/elf/mul_mont_384-armv8.S create mode 100644 crypto/blst_src/build/elf/mulq_mont_256-x86_64.s create mode 100644 crypto/blst_src/build/elf/mulq_mont_384-x86_64.s create mode 100644 crypto/blst_src/build/elf/mulx_mont_256-x86_64.s create mode 100644 crypto/blst_src/build/elf/mulx_mont_384-x86_64.s create mode 100644 crypto/blst_src/build/elf/sha256-armv8.S create mode 100644 crypto/blst_src/build/elf/sha256-portable-x86_64.s create mode 100644 crypto/blst_src/build/elf/sha256-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/add_mod_256-armv8.S create mode 100644 crypto/blst_src/build/mach-o/add_mod_256-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/add_mod_384-armv8.S create mode 100644 crypto/blst_src/build/mach-o/add_mod_384-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/add_mod_384x384-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/ct_inverse_mod_256-armv8.S create mode 100644 crypto/blst_src/build/mach-o/ct_inverse_mod_256-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/ct_inverse_mod_384-armv8.S create mode 100644 crypto/blst_src/build/mach-o/ct_is_square_mod_384-armv8.S create mode 100644 crypto/blst_src/build/mach-o/ct_is_square_mod_384-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/ctq_inverse_mod_384-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/ctx_inverse_mod_384-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/div3w-armv8.S create mode 100644 crypto/blst_src/build/mach-o/div3w-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/mul_mont_256-armv8.S create mode 100644 crypto/blst_src/build/mach-o/mul_mont_384-armv8.S create mode 100644 
crypto/blst_src/build/mach-o/mulq_mont_256-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/mulq_mont_384-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/mulx_mont_256-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/mulx_mont_384-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/sha256-armv8.S create mode 100644 crypto/blst_src/build/mach-o/sha256-portable-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/sha256-x86_64.s create mode 100755 crypto/blst_src/build/refresh.sh create mode 100644 crypto/blst_src/build/win64/add_mod_256-armv8.asm create mode 100644 crypto/blst_src/build/win64/add_mod_256-x86_64.asm create mode 100644 crypto/blst_src/build/win64/add_mod_384-armv8.asm create mode 100644 crypto/blst_src/build/win64/add_mod_384-x86_64.asm create mode 100644 crypto/blst_src/build/win64/add_mod_384x384-x86_64.asm create mode 100644 crypto/blst_src/build/win64/blst.def create mode 100644 crypto/blst_src/build/win64/ct_inverse_mod_256-armv8.asm create mode 100644 crypto/blst_src/build/win64/ct_inverse_mod_256-x86_64.asm create mode 100644 crypto/blst_src/build/win64/ct_inverse_mod_384-armv8.asm create mode 100644 crypto/blst_src/build/win64/ct_is_square_mod_384-armv8.asm create mode 100644 crypto/blst_src/build/win64/ct_is_square_mod_384-x86_64.asm create mode 100644 crypto/blst_src/build/win64/ctq_inverse_mod_384-x86_64.asm create mode 100644 crypto/blst_src/build/win64/ctx_inverse_mod_384-x86_64.asm create mode 100644 crypto/blst_src/build/win64/div3w-armv8.asm create mode 100644 crypto/blst_src/build/win64/div3w-x86_64.asm create mode 100644 crypto/blst_src/build/win64/dll.c create mode 100644 crypto/blst_src/build/win64/mul_mont_256-armv8.asm create mode 100644 crypto/blst_src/build/win64/mul_mont_384-armv8.asm create mode 100644 crypto/blst_src/build/win64/mulq_mont_256-x86_64.asm create mode 100644 crypto/blst_src/build/win64/mulq_mont_384-x86_64.asm create mode 100644 crypto/blst_src/build/win64/mulx_mont_256-x86_64.asm create mode 100644 crypto/blst_src/build/win64/mulx_mont_384-x86_64.asm create mode 100644 crypto/blst_src/build/win64/sha256-armv8.asm create mode 100644 crypto/blst_src/build/win64/sha256-x86_64.asm create mode 100644 crypto/blst_src/bulk_addition.c create mode 100644 crypto/blst_src/bytes.h create mode 100644 crypto/blst_src/client_min_pk.c create mode 100644 crypto/blst_src/client_min_sig.c create mode 100644 crypto/blst_src/consts.c create mode 100644 crypto/blst_src/consts.h create mode 100644 crypto/blst_src/e1.c create mode 100644 crypto/blst_src/e2.c create mode 100644 crypto/blst_src/ec_mult.h create mode 100644 crypto/blst_src/ec_ops.h create mode 100644 crypto/blst_src/errors.h create mode 100644 crypto/blst_src/exp.c create mode 100644 crypto/blst_src/exports.c create mode 100644 crypto/blst_src/fields.h create mode 100644 crypto/blst_src/fp12_tower.c create mode 100644 crypto/blst_src/hash_to_field.c create mode 100644 crypto/blst_src/keygen.c create mode 100644 crypto/blst_src/map_to_g1.c create mode 100644 crypto/blst_src/map_to_g2.c create mode 100644 crypto/blst_src/multi_scalar.c create mode 100644 crypto/blst_src/no_asm.h create mode 100644 crypto/blst_src/pairing.c create mode 100644 crypto/blst_src/pentaroot-addchain.h create mode 100644 crypto/blst_src/pentaroot.c create mode 100644 crypto/blst_src/point.h create mode 100644 crypto/blst_src/rb_tree.c create mode 100644 crypto/blst_src/recip-addchain.h create mode 100644 crypto/blst_src/recip.c create mode 100644 crypto/blst_src/server.c create mode 
100644 crypto/blst_src/sha256.h create mode 100644 crypto/blst_src/sqrt-addchain.h create mode 100644 crypto/blst_src/sqrt.c create mode 100644 crypto/blst_src/vect.c create mode 100644 crypto/blst_src/vect.h diff --git a/crypto/blst_src/LICENSE b/crypto/blst_src/LICENSE new file mode 100644 index 00000000000..261eeb9e9f8 --- /dev/null +++ b/crypto/blst_src/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. 
We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/crypto/blst_src/aggregate.c b/crypto/blst_src/aggregate.c new file mode 100644 index 00000000000..8a24e0590ba --- /dev/null +++ b/crypto/blst_src/aggregate.c @@ -0,0 +1,673 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * Usage pattern on single-processor system is + * + * blst_pairing_init(ctx, hash_or_encode, DST); + * blst_pairing_aggregate_pk_in_g1(ctx, PK[0], aggregated_signature, msg[0]); + * blst_pairing_aggregate_pk_in_g1(ctx, PK[1], NULL, msg[1]); + * ... + * blst_pairing_commit(ctx); + * blst_pairing_finalverify(ctx, NULL); + * + *********************************************************************** + * Usage pattern on multi-processor system is + * + * blst_pairing_init(pk[0], hash_or_encode, DST); + * blst_pairing_init(pk[1], hash_or_encode, DST); + * ... + * start threads each processing an N/nthreads slice of PKs and messages: + * blst_pairing_aggregate_pk_in_g1(pk[i], PK[i*n+0], NULL, msg[i*n+0]); + * blst_pairing_aggregate_pk_in_g1(pk[i], PK[i*n+1], NULL, msg[i*n+1]); + * ... + * blst_pairing_commit(pkx); + * ... + * meanwhile in main thread + * blst_fp12 gtsig; + * blst_aggregated_in_g2(&gtsig, aggregated_signature); + * join threads and merge their contexts: + * blst_pairing_merge(pk[0], pk[1]); + * blst_pairing_merge(pk[0], pk[2]); + * ... + * blst_pairing_finalverify(pk[0], gtsig); + */ + +#ifndef N_MAX +# define N_MAX 8 +#endif + +typedef union { POINTonE1 e1; POINTonE2 e2; } AggregatedSignature; +typedef struct { + unsigned int ctrl; + unsigned int nelems; + const void *DST; + size_t DST_len; + vec384fp12 GT; + AggregatedSignature AggrSign; + POINTonE2_affine Q[N_MAX]; + POINTonE1_affine P[N_MAX]; +} PAIRING; + +enum { AGGR_UNDEFINED = 0, + AGGR_MIN_SIG = 1, + AGGR_MIN_PK = 2, + AGGR_SIGN_SET = 0x10, + AGGR_GT_SET = 0x20, + AGGR_HASH_OR_ENCODE = 0x40 }; +#define MIN_SIG_OR_PK (AGGR_MIN_SIG | AGGR_MIN_PK) + +static const size_t sizeof_pairing = (sizeof(PAIRING) + 7) & ~(size_t)7; + +size_t blst_pairing_sizeof(void) +{ return sizeof_pairing; } + +void blst_pairing_init(PAIRING *ctx, int hash_or_encode, + const void *DST, size_t DST_len) +{ + ctx->ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0); + ctx->nelems = 0; + ctx->DST = (uptr_t)DST==(uptr_t)((byte *)ctx+sizeof_pairing) ? (void *)42 + : DST; + ctx->DST_len = DST_len; +} + +static const void *pairing_get_dst(const PAIRING *ctx) +{ return (uptr_t)ctx->DST==(uptr_t)42 ? 
(const byte *)ctx+sizeof_pairing + : ctx->DST; +} + +const void *blst_pairing_get_dst(const PAIRING *ctx) +{ return pairing_get_dst(ctx); } + +#define FROM_AFFINE(out,in) do { \ + vec_copy((out)->X, in->X, 2*sizeof(in->X)), \ + vec_select((out)->Z, in->X, BLS12_381_Rx.p, sizeof(in->X), \ + vec_is_zero(in->X, 2*sizeof(in->X))); } while(0) + +/* + * Optional |nbits|-wide |scalar| is used to facilitate multiple aggregated + * signature verification as discussed at + * https://ethresear.ch/t/fast-verification-of-multiple-bls-signatures/5407. + * Usage pattern is not finalized yet, because (sig != NULL) is better and + * will be handled separately... + */ +static BLST_ERROR PAIRING_Aggregate_PK_in_G2(PAIRING *ctx, + const POINTonE2_affine *PK, + size_t pk_groupcheck, + const POINTonE1_affine *sig, + size_t sig_groupcheck, + const byte *scalar, size_t nbits, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ + if (ctx->ctrl & AGGR_MIN_PK) + return BLST_AGGR_TYPE_MISMATCH; + + ctx->ctrl |= AGGR_MIN_SIG; + + /* + * Since we don't know if the signature is individual or aggregated, + * the only sensible thing to do is to skip over infinite one and + * count on the corresponding infinite public key to be rejected, + * in case the signature is non-aggregated that is. + */ + if (sig != NULL && !vec_is_zero(sig, sizeof(*sig))) { + POINTonE1 *S = &ctx->AggrSign.e1; + POINTonE1 P[1]; + + FROM_AFFINE(P, sig); + + if (sig_groupcheck && !POINTonE1_in_G1(P)) + return BLST_POINT_NOT_IN_GROUP; + + if (ctx->ctrl & AGGR_SIGN_SET) { + if (nbits != 0 && scalar != NULL) { + POINTonE1_mult_w5(P, P, scalar, nbits); + POINTonE1_dadd(S, S, P, NULL); + } else { + POINTonE1_dadd_affine(S, S, sig); + } + } else { + ctx->ctrl |= AGGR_SIGN_SET; + if (nbits != 0 && scalar != NULL) + POINTonE1_mult_w5(S, P, scalar, nbits); + else + vec_copy(S, P, sizeof(P)); + } + } + + if (PK != NULL) { + unsigned int n; + POINTonE1 H[1]; + const void *DST = pairing_get_dst(ctx); + + /* + * Reject infinite public keys. 
+ */ + if (vec_is_zero(PK, sizeof(*PK))) + return BLST_PK_IS_INFINITY; + + if (pk_groupcheck) { + POINTonE2 P[1]; + + FROM_AFFINE(P, PK); + if (!POINTonE2_in_G2(P)) + return BLST_POINT_NOT_IN_GROUP; + } + + if (ctx->ctrl & AGGR_HASH_OR_ENCODE) + Hash_to_G1(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); + else + Encode_to_G1(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); + + if (nbits != 0 && scalar != NULL) + POINTonE1_mult_w5(H, H, scalar, nbits); + + POINTonE1_from_Jacobian(H, H); + + n = ctx->nelems; + vec_copy(ctx->Q + n, PK, sizeof(POINTonE2_affine)); + vec_copy(ctx->P + n, H, sizeof(POINTonE1_affine)); + if (++n == N_MAX) { + if (ctx->ctrl & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->ctrl |= AGGR_GT_SET; + } + n = 0; + } + ctx->nelems = n; + } + + return BLST_SUCCESS; +} + +BLST_ERROR blst_pairing_aggregate_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + const POINTonE1_affine *signature, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, 0, signature, 1, NULL, 0, + msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + const POINTonE1_affine *sig, + const byte *scalar, + size_t nbits, + const void *msg, + size_t msg_len, + const void *aug, + size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, 0, sig, 1, scalar, nbits, + msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + size_t pk_grpchk, + const POINTonE1_affine *signature, + size_t sig_grpchk, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, pk_grpchk, signature, sig_grpchk, + NULL, 0, msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + size_t pk_grpchk, + const POINTonE1_affine *sig, + size_t sig_grpchk, + const byte *scalar, + size_t nbits, + const void *msg, + size_t msg_len, + const void *aug, + size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, pk_grpchk, sig, sig_grpchk, + scalar, nbits, + msg, msg_len, aug, aug_len); +} + +static BLST_ERROR PAIRING_Aggregate_PK_in_G1(PAIRING *ctx, + const POINTonE1_affine *PK, + size_t pk_groupcheck, + const POINTonE2_affine *sig, + size_t sig_groupcheck, + const byte *scalar, size_t nbits, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ + if (ctx->ctrl & AGGR_MIN_SIG) + return BLST_AGGR_TYPE_MISMATCH; + + ctx->ctrl |= AGGR_MIN_PK; + + /* + * Since we don't know if the signature is individual or aggregated, + * the only sensible thing to do is to skip over infinite one and + * count on the corresponding infinite public key to be rejected, + * in case the signature is non-aggregated that is. 
+ */ + if (sig != NULL && !vec_is_zero(sig, sizeof(*sig))) { + POINTonE2 *S = &ctx->AggrSign.e2; + POINTonE2 P[1]; + + FROM_AFFINE(P, sig); + + if (sig_groupcheck && !POINTonE2_in_G2(P)) + return BLST_POINT_NOT_IN_GROUP; + + if (ctx->ctrl & AGGR_SIGN_SET) { + if (nbits != 0 && scalar != NULL) { + + POINTonE2_mult_w5(P, P, scalar, nbits); + POINTonE2_dadd(S, S, P, NULL); + } else { + POINTonE2_dadd_affine(S, S, sig); + } + } else { + ctx->ctrl |= AGGR_SIGN_SET; + if (nbits != 0 && scalar != NULL) + POINTonE2_mult_w5(S, P, scalar, nbits); + else + vec_copy(S, P, sizeof(P)); + } + } + + if (PK != NULL) { + unsigned int n; + POINTonE2 H[1]; + POINTonE1 pk[1]; + const void *DST = pairing_get_dst(ctx); + + /* + * Reject infinite public keys. + */ + if (vec_is_zero(PK, sizeof(*PK))) + return BLST_PK_IS_INFINITY; + + if (pk_groupcheck) { + POINTonE1 P[1]; + + FROM_AFFINE(P, PK); + if (!POINTonE1_in_G1(P)) + return BLST_POINT_NOT_IN_GROUP; + } + + if (ctx->ctrl & AGGR_HASH_OR_ENCODE) + Hash_to_G2(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); + else + Encode_to_G2(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); + + POINTonE2_from_Jacobian(H, H); + + if (nbits != 0 && scalar != NULL) { + FROM_AFFINE(pk, PK); + POINTonE1_mult_w5(pk, pk, scalar, nbits); + POINTonE1_from_Jacobian(pk, pk); + PK = (const POINTonE1_affine *)pk; + } + + n = ctx->nelems; + vec_copy(ctx->Q + n, H, sizeof(POINTonE2_affine)); + vec_copy(ctx->P + n, PK, sizeof(POINTonE1_affine)); + if (++n == N_MAX) { + if (ctx->ctrl & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->ctrl |= AGGR_GT_SET; + } + n = 0; + } + ctx->nelems = n; + } + + return BLST_SUCCESS; +} + +BLST_ERROR blst_pairing_aggregate_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + const POINTonE2_affine *signature, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, 0, signature, 1, NULL, 0, + msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + const POINTonE2_affine *sig, + const byte *scalar, + size_t nbits, + const void *msg, + size_t msg_len, + const void *aug, + size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, 0, sig, 1, scalar, nbits, + msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + size_t pk_grpchk, + const POINTonE2_affine *signature, + size_t sig_grpchk, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, pk_grpchk, signature, sig_grpchk, + NULL, 0, msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + size_t pk_grpchk, + const POINTonE2_affine *sig, + size_t sig_grpchk, + const byte *scalar, + size_t nbits, + const void *msg, + size_t msg_len, + const void *aug, + size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, pk_grpchk, sig, sig_grpchk, + scalar, nbits, + msg, msg_len, aug, aug_len); +} + +static void PAIRING_Commit(PAIRING *ctx) +{ + unsigned int n; + + if ((n = ctx->nelems) != 0) { + if (ctx->ctrl & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->ctrl |= AGGR_GT_SET; + } + ctx->nelems = 0; + } +} + +void 
blst_pairing_commit(PAIRING *ctx) +{ PAIRING_Commit(ctx); } + +BLST_ERROR blst_pairing_merge(PAIRING *ctx, const PAIRING *ctx1) +{ + if ((ctx->ctrl & MIN_SIG_OR_PK) != AGGR_UNDEFINED + && (ctx1->ctrl & MIN_SIG_OR_PK) != AGGR_UNDEFINED + && (ctx->ctrl & ctx1->ctrl & MIN_SIG_OR_PK) == 0) + return BLST_AGGR_TYPE_MISMATCH; + + /* context producers are expected to have called blst_pairing_commit */ + if (ctx->nelems || ctx1->nelems) + return BLST_AGGR_TYPE_MISMATCH; + + ctx->ctrl |= ctx1->ctrl & MIN_SIG_OR_PK; + + switch (ctx->ctrl & MIN_SIG_OR_PK) { + case AGGR_MIN_SIG: + if (ctx->ctrl & ctx1->ctrl & AGGR_SIGN_SET) { + POINTonE1_dadd(&ctx->AggrSign.e1, &ctx->AggrSign.e1, + &ctx1->AggrSign.e1, NULL); + } else if (ctx1->ctrl & AGGR_SIGN_SET) { + ctx->ctrl |= AGGR_SIGN_SET; + vec_copy(&ctx->AggrSign.e1, &ctx1->AggrSign.e1, + sizeof(ctx->AggrSign.e1)); + } + break; + case AGGR_MIN_PK: + if (ctx->ctrl & ctx1->ctrl & AGGR_SIGN_SET) { + POINTonE2_dadd(&ctx->AggrSign.e2, &ctx->AggrSign.e2, + &ctx1->AggrSign.e2, NULL); + } else if (ctx1->ctrl & AGGR_SIGN_SET) { + ctx->ctrl |= AGGR_SIGN_SET; + vec_copy(&ctx->AggrSign.e2, &ctx1->AggrSign.e2, + sizeof(ctx->AggrSign.e2)); + } + break; + case AGGR_UNDEFINED: + break; + default: + return BLST_AGGR_TYPE_MISMATCH; + } + + if (ctx->ctrl & ctx1->ctrl & AGGR_GT_SET) { + mul_fp12(ctx->GT, ctx->GT, ctx1->GT); + } else if (ctx1->ctrl & AGGR_GT_SET) { + ctx->ctrl |= AGGR_GT_SET; + vec_copy(ctx->GT, ctx1->GT, sizeof(ctx->GT)); + } + + return BLST_SUCCESS; +} + +static bool_t PAIRING_FinalVerify(const PAIRING *ctx, const vec384fp12 GTsig) +{ + vec384fp12 GT; + + if (!(ctx->ctrl & AGGR_GT_SET)) + return 0; + + if (GTsig != NULL) { + vec_copy(GT, GTsig, sizeof(GT)); + } else if (ctx->ctrl & AGGR_SIGN_SET) { + AggregatedSignature AggrSign; + + switch (ctx->ctrl & MIN_SIG_OR_PK) { + case AGGR_MIN_SIG: + POINTonE1_from_Jacobian(&AggrSign.e1, &ctx->AggrSign.e1); + miller_loop_n(GT, (const POINTonE2_affine *)&BLS12_381_G2, + (const POINTonE1_affine *)&AggrSign.e1, 1); + break; + case AGGR_MIN_PK: + POINTonE2_from_Jacobian(&AggrSign.e2, &ctx->AggrSign.e2); + miller_loop_n(GT, (const POINTonE2_affine *)&AggrSign.e2, + (const POINTonE1_affine *)&BLS12_381_G1, 1); + break; + default: + return 0; + } + } else { + /* + * The aggregated signature was infinite, relation between the + * hashes and the public keys has to be VERY special... 
+ */ + vec_copy(GT, BLS12_381_Rx.p12, sizeof(GT)); + } + + conjugate_fp12(GT); + mul_fp12(GT, GT, ctx->GT); + final_exp(GT, GT); + + /* return GT==1 */ + return vec_is_equal(GT[0][0], BLS12_381_Rx.p2, sizeof(GT[0][0])) & + vec_is_zero(GT[0][1], sizeof(GT) - sizeof(GT[0][0])); +} + +int blst_pairing_finalverify(const PAIRING *ctx, const vec384fp12 GTsig) +{ return (int)PAIRING_FinalVerify(ctx, GTsig); } + +int blst_fp12_finalverify(const vec384fp12 GT1, const vec384fp12 GT2) +{ + vec384fp12 GT; + + vec_copy(GT, GT1, sizeof(GT)); + conjugate_fp12(GT); + mul_fp12(GT, GT, GT2); + final_exp(GT, GT); + + /* return GT==1 */ + return (int)(vec_is_equal(GT[0][0], BLS12_381_Rx.p2, sizeof(GT[0][0])) & + vec_is_zero(GT[0][1], sizeof(GT) - sizeof(GT[0][0]))); +} + +void blst_pairing_raw_aggregate(PAIRING *ctx, const POINTonE2_affine *q, + const POINTonE1_affine *p) +{ + unsigned int n; + + if (vec_is_zero(q, sizeof(*q)) & vec_is_zero(p, sizeof(*p))) + return; + + n = ctx->nelems; + vec_copy(ctx->Q + n, q, sizeof(*q)); + vec_copy(ctx->P + n, p, sizeof(*p)); + if (++n == N_MAX) { + if (ctx->ctrl & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->ctrl |= AGGR_GT_SET; + } + n = 0; + } + ctx->nelems = n; +} + +vec384fp12 *blst_pairing_as_fp12(PAIRING *ctx) +{ + PAIRING_Commit(ctx); + return (vec384fp12 *)ctx->GT; +} + +/* + * PAIRING context-free entry points. + * + * To perform FastAggregateVerify, aggregate all public keys and + * signatures with corresponding blst_aggregate_in_g{12}, convert + * result to affine and call suitable blst_core_verify_pk_in_g{12} + * or blst_aggregated_in_g{12}... + */ +BLST_ERROR blst_aggregate_in_g1(POINTonE1 *out, const POINTonE1 *in, + const unsigned char *zwire) +{ + POINTonE1 P[1]; + BLST_ERROR ret; + + ret = POINTonE1_Deserialize_Z((POINTonE1_affine *)P, zwire); + + if (ret != BLST_SUCCESS) + return ret; + + if (vec_is_zero(P, sizeof(POINTonE1_affine))) { + if (in == NULL) + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + + vec_copy(P->Z, BLS12_381_Rx.p, sizeof(P->Z)); + + if (!POINTonE1_in_G1(P)) + return BLST_POINT_NOT_IN_GROUP; + + if (in == NULL) + vec_copy(out, P, sizeof(P)); + else + POINTonE1_dadd_affine(out, in, (POINTonE1_affine *)P); + + return BLST_SUCCESS; +} + +BLST_ERROR blst_aggregate_in_g2(POINTonE2 *out, const POINTonE2 *in, + const unsigned char *zwire) +{ + POINTonE2 P[1]; + BLST_ERROR ret; + + ret = POINTonE2_Deserialize_Z((POINTonE2_affine *)P, zwire); + + if (ret != BLST_SUCCESS) + return ret; + + if (vec_is_zero(P, sizeof(POINTonE2_affine))) { + if (in == NULL) + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + + vec_copy(P->Z, BLS12_381_Rx.p, sizeof(P->Z)); + + if (!POINTonE2_in_G2(P)) + return BLST_POINT_NOT_IN_GROUP; + + if (in == NULL) { + vec_copy(out, P, sizeof(P)); + } else { + POINTonE2_dadd_affine(out, in, (POINTonE2_affine *)P); + } + return BLST_SUCCESS; +} + +void blst_aggregated_in_g1(vec384fp12 ret, const POINTonE1_affine *sig) +{ miller_loop_n(ret, (const POINTonE2_affine *)&BLS12_381_G2, sig, 1); } + +void blst_aggregated_in_g2(vec384fp12 ret, const POINTonE2_affine *sig) +{ miller_loop_n(ret, sig, (const POINTonE1_affine *)&BLS12_381_G1, 1); } + +BLST_ERROR blst_core_verify_pk_in_g1(const POINTonE1_affine *pk, + const POINTonE2_affine *signature, + int hash_or_encode, + const void *msg, size_t msg_len, + const void *DST, size_t DST_len, + const void *aug, size_t aug_len) +{ + PAIRING ctx; + 
BLST_ERROR ret; + + ctx.ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0); + ctx.nelems = 0; + ctx.DST = DST; + ctx.DST_len = DST_len; + + ret = PAIRING_Aggregate_PK_in_G1(&ctx, pk, 1, signature, 1, NULL, 0, + msg, msg_len, aug, aug_len); + if (ret != BLST_SUCCESS) + return ret; + + PAIRING_Commit(&ctx); + + return PAIRING_FinalVerify(&ctx, NULL) ? BLST_SUCCESS : BLST_VERIFY_FAIL; +} + +BLST_ERROR blst_core_verify_pk_in_g2(const POINTonE2_affine *pk, + const POINTonE1_affine *signature, + int hash_or_encode, + const void *msg, size_t msg_len, + const void *DST, size_t DST_len, + const void *aug, size_t aug_len) +{ + PAIRING ctx; + BLST_ERROR ret; + + ctx.ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0); + ctx.nelems = 0; + ctx.DST = DST; + ctx.DST_len = DST_len; + + ret = PAIRING_Aggregate_PK_in_G2(&ctx, pk, 1, signature, 1, NULL, 0, + msg, msg_len, aug, aug_len); + if (ret != BLST_SUCCESS) + return ret; + + PAIRING_Commit(&ctx); + + return PAIRING_FinalVerify(&ctx, NULL) ? BLST_SUCCESS : BLST_VERIFY_FAIL; +} diff --git a/crypto/blst_src/asm/add_mod_256-armv8.pl b/crypto/blst_src/asm/add_mod_256-armv8.pl new file mode 100755 index 00000000000..34d9145261b --- /dev/null +++ b/crypto/blst_src/asm/add_mod_256-armv8.pl @@ -0,0 +1,412 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3); + +@mod=map("x$_",(4..7)); +@a=map("x$_",(8..11)); +@b=map("x$_",(12..15)); +@t=map("x$_",(16,17,1..3)); + +$code.=<<___; +.text + +.globl add_mod_256 +.hidden add_mod_256 +.type add_mod_256,%function +.align 5 +add_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + + ldp @a[2],@a[3],[$a_ptr,#16] + adds @a[0],@a[0],@b[0] + ldp @b[2],@b[3],[$b_ptr,#16] + adcs @a[1],@a[1],@b[1] + ldp @mod[0],@mod[1],[$n_ptr] + adcs @a[2],@a[2],@b[2] + ldp @mod[2],@mod[3],[$n_ptr,#16] + adcs @a[3],@a[3],@b[3] + adc @t[4],xzr,xzr + + subs @t[0],@a[0],@mod[0] + sbcs @t[1],@a[1],@mod[1] + sbcs @t[2],@a[2],@mod[2] + sbcs @t[3],@a[3],@mod[3] + sbcs xzr,@t[4],xzr + + csel @a[0],@a[0],@t[0],lo + csel @a[1],@a[1],@t[1],lo + csel @a[2],@a[2],@t[2],lo + stp @a[0],@a[1],[$r_ptr] + csel @a[3],@a[3],@t[3],lo + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size add_mod_256,.-add_mod_256 + +.globl mul_by_3_mod_256 +.hidden mul_by_3_mod_256 +.type mul_by_3_mod_256,%function +.align 5 +mul_by_3_mod_256: + ldp @b[0],@b[1],[$a_ptr] + ldp @b[2],@b[3],[$a_ptr,#16] + + adds @a[0],@b[0],@b[0] + ldp @mod[0],@mod[1],[$b_ptr] + adcs @a[1],@b[1],@b[1] + ldp @mod[2],@mod[3],[$b_ptr,#16] + adcs @a[2],@b[2],@b[2] + adcs @a[3],@b[3],@b[3] + adc @t[4],xzr,xzr + + subs @t[0],@a[0],@mod[0] + sbcs @t[1],@a[1],@mod[1] + sbcs @t[2],@a[2],@mod[2] + sbcs @t[3],@a[3],@mod[3] + sbcs xzr,@t[4],xzr + + csel @a[0],@a[0],@t[0],lo + csel @a[1],@a[1],@t[1],lo + csel @a[2],@a[2],@t[2],lo + csel @a[3],@a[3],@t[3],lo + + adds @a[0],@a[0],@b[0] + adcs @a[1],@a[1],@b[1] + adcs @a[2],@a[2],@b[2] + adcs @a[3],@a[3],@b[3] + adc @t[4],xzr,xzr + + subs @t[0],@a[0],@mod[0] + sbcs @t[1],@a[1],@mod[1] + sbcs 
@t[2],@a[2],@mod[2] + sbcs @t[3],@a[3],@mod[3] + sbcs xzr,@t[4],xzr + + csel @a[0],@a[0],@t[0],lo + csel @a[1],@a[1],@t[1],lo + csel @a[2],@a[2],@t[2],lo + stp @a[0],@a[1],[$r_ptr] + csel @a[3],@a[3],@t[3],lo + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size mul_by_3_mod_256,.-mul_by_3_mod_256 + +.globl lshift_mod_256 +.hidden lshift_mod_256 +.type lshift_mod_256,%function +.align 5 +lshift_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + +.Loop_lshift_mod_256: + adds @a[0],@a[0],@a[0] + sub $b_ptr,$b_ptr,#1 + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adc @t[4],xzr,xzr + + subs @b[0],@a[0],@mod[0] + sbcs @b[1],@a[1],@mod[1] + sbcs @b[2],@a[2],@mod[2] + sbcs @b[3],@a[3],@mod[3] + sbcs xzr,@t[4],xzr + + csel @a[0],@a[0],@b[0],lo + csel @a[1],@a[1],@b[1],lo + csel @a[2],@a[2],@b[2],lo + csel @a[3],@a[3],@b[3],lo + + cbnz $b_ptr,.Loop_lshift_mod_256 + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size lshift_mod_256,.-lshift_mod_256 + +.globl rshift_mod_256 +.hidden rshift_mod_256 +.type rshift_mod_256,%function +.align 5 +rshift_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + +.Loop_rshift: + adds @b[0],@a[0],@mod[0] + sub $b_ptr,$b_ptr,#1 + adcs @b[1],@a[1],@mod[1] + adcs @b[2],@a[2],@mod[2] + adcs @b[3],@a[3],@mod[3] + adc @t[4],xzr,xzr + tst @a[0],#1 + + csel @b[0],@b[0],@a[0],ne + csel @b[1],@b[1],@a[1],ne + csel @b[2],@b[2],@a[2],ne + csel @b[3],@b[3],@a[3],ne + csel @t[4],@t[4],xzr,ne + + extr @a[0],@b[1],@b[0],#1 + extr @a[1],@b[2],@b[1],#1 + extr @a[2],@b[3],@b[2],#1 + extr @a[3],@t[4],@b[3],#1 + + cbnz $b_ptr,.Loop_rshift + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size rshift_mod_256,.-rshift_mod_256 + +.globl cneg_mod_256 +.hidden cneg_mod_256 +.type cneg_mod_256,%function +.align 5 +cneg_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @mod[0],@mod[1],[$n_ptr] + + ldp @a[2],@a[3],[$a_ptr,#16] + subs @b[0],@mod[0],@a[0] + ldp @mod[2],@mod[3],[$n_ptr,#16] + orr @mod[0],@a[0],@a[1] + sbcs @b[1],@mod[1],@a[1] + orr @mod[1],@a[2],@a[3] + sbcs @b[2],@mod[2],@a[2] + orr @t[4],@mod[0],@mod[1] + sbc @b[3],@mod[3],@a[3] + + cmp @t[4],#0 + csetm @t[4],ne + ands $b_ptr,$b_ptr,@t[4] + + csel @a[0],@a[0],@b[0],eq + csel @a[1],@a[1],@b[1],eq + csel @a[2],@a[2],@b[2],eq + stp @a[0],@a[1],[$r_ptr] + csel @a[3],@a[3],@b[3],eq + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size cneg_mod_256,.-cneg_mod_256 + +.globl sub_mod_256 +.hidden sub_mod_256 +.type sub_mod_256,%function +.align 5 +sub_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + + ldp @a[2],@a[3],[$a_ptr,#16] + subs @a[0],@a[0],@b[0] + ldp @b[2],@b[3],[$b_ptr,#16] + sbcs @a[1],@a[1],@b[1] + ldp @mod[0],@mod[1],[$n_ptr] + sbcs @a[2],@a[2],@b[2] + ldp @mod[2],@mod[3],[$n_ptr,#16] + sbcs @a[3],@a[3],@b[3] + sbc @t[4],xzr,xzr + + and @mod[0],@mod[0],@t[4] + and @mod[1],@mod[1],@t[4] + adds @a[0],@a[0],@mod[0] + and @mod[2],@mod[2],@t[4] + adcs @a[1],@a[1],@mod[1] + and @mod[3],@mod[3],@t[4] + adcs @a[2],@a[2],@mod[2] + stp @a[0],@a[1],[$r_ptr] + adc @a[3],@a[3],@mod[3] + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size sub_mod_256,.-sub_mod_256 + +.globl check_mod_256 +.hidden check_mod_256 +.type check_mod_256,%function +.align 5 +check_mod_256: + ldp @a[0],@a[1],[$r_ptr] + ldp @a[2],@a[3],[$r_ptr,#16] + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + +#ifdef __AARCH64EB__ + 
rev @a[0],@a[0] + rev @a[1],@a[1] + rev @a[2],@a[2] + rev @a[3],@a[3] +#endif + + subs xzr,@a[0],@mod[0] + sbcs xzr,@a[1],@mod[1] + orr @a[0],@a[0],@a[1] + sbcs xzr,@a[2],@mod[2] + orr @a[0],@a[0],@a[2] + sbcs xzr,@a[3],@mod[3] + orr @a[0],@a[0],@a[3] + sbc $a_ptr,xzr,xzr + + cmp @a[0],#0 + mov x0,#1 + csel x0,x0,xzr,ne + and x0,x0,$a_ptr + + ret +.size check_mod_256,.-check_mod_256 + +.globl add_n_check_mod_256 +.hidden add_n_check_mod_256 +.type add_n_check_mod_256,%function +.align 5 +add_n_check_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @b[2],@b[3],[$b_ptr,#16] + +#ifdef __AARCH64EB__ + rev @a[0],@a[0] + rev @b[0],@b[0] + rev @a[1],@a[1] + rev @b[1],@b[1] + rev @a[2],@a[2] + rev @b[2],@b[2] + rev @a[3],@a[3] + rev @b[3],@b[3] +#endif + + adds @a[0],@a[0],@b[0] + ldp @mod[0],@mod[1],[$n_ptr] + adcs @a[1],@a[1],@b[1] + ldp @mod[2],@mod[3],[$n_ptr,#16] + adcs @a[2],@a[2],@b[2] + adcs @a[3],@a[3],@b[3] + adc @t[4],xzr,xzr + + subs @t[0],@a[0],@mod[0] + sbcs @t[1],@a[1],@mod[1] + sbcs @t[2],@a[2],@mod[2] + sbcs @t[3],@a[3],@mod[3] + sbcs xzr,@t[4],xzr + + csel @a[0],@a[0],@t[0],lo + csel @a[1],@a[1],@t[1],lo + csel @a[2],@a[2],@t[2],lo + csel @a[3],@a[3],@t[3],lo + + orr @t[0], @a[0], @a[1] + orr @t[1], @a[2], @a[3] + orr @t[0], @t[0], @t[1] + +#ifdef __AARCH64EB__ + rev @a[0],@a[0] + rev @a[1],@a[1] + rev @a[2],@a[2] + rev @a[3],@a[3] +#endif + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + mov @t[1], #1 + cmp @t[0], #0 + csel x0, @t[1], xzr, ne + + ret +.size add_n_check_mod_256,.-add_n_check_mod_256 + +.globl sub_n_check_mod_256 +.hidden sub_n_check_mod_256 +.type sub_n_check_mod_256,%function +.align 5 +sub_n_check_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @b[2],@b[3],[$b_ptr,#16] + +#ifdef __AARCH64EB__ + rev @a[0],@a[0] + rev @b[0],@b[0] + rev @a[1],@a[1] + rev @b[1],@b[1] + rev @a[2],@a[2] + rev @b[2],@b[2] + rev @a[3],@a[3] + rev @b[3],@b[3] +#endif + + subs @a[0],@a[0],@b[0] + sbcs @a[1],@a[1],@b[1] + ldp @mod[0],@mod[1],[$n_ptr] + sbcs @a[2],@a[2],@b[2] + ldp @mod[2],@mod[3],[$n_ptr,#16] + sbcs @a[3],@a[3],@b[3] + sbc @t[4],xzr,xzr + + and @mod[0],@mod[0],@t[4] + and @mod[1],@mod[1],@t[4] + adds @a[0],@a[0],@mod[0] + and @mod[2],@mod[2],@t[4] + adcs @a[1],@a[1],@mod[1] + and @mod[3],@mod[3],@t[4] + adcs @a[2],@a[2],@mod[2] + adc @a[3],@a[3],@mod[3] + + orr @t[0], @a[0], @a[1] + orr @t[1], @a[2], @a[3] + orr @t[0], @t[0], @t[1] + +#ifdef __AARCH64EB__ + rev @a[0],@a[0] + rev @a[1],@a[1] + rev @a[2],@a[2] + rev @a[3],@a[3] +#endif + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + mov @t[1], #1 + cmp @t[0], #0 + csel x0, @t[1], xzr, ne + + ret +.size sub_n_check_mod_256,.-sub_n_check_mod_256 +___ + +print $code; + +close STDOUT; diff --git a/crypto/blst_src/asm/add_mod_256-x86_64.pl b/crypto/blst_src/asm/add_mod_256-x86_64.pl new file mode 100755 index 00000000000..1d656fb90bf --- /dev/null +++ b/crypto/blst_src/asm/add_mod_256-x86_64.pl @@ -0,0 +1,547 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr) = ("%rdi","%rsi","%rdx","%rcx"); +$b_ptr = "%rbx"; + +{ ############################################################## 256 bits add +my @acc=map("%r$_",(8..11, "ax", "si", "bx", "bp", 12)); + +$code.=<<___; +.text + +.globl add_mod_256 +.hidden add_mod_256 +.type add_mod_256,\@function,4,"unwind" +.align 32 +add_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + +.Loaded_a_add_mod_256: + add 8*0($b_org), @acc[0] + adc 8*1($b_org), @acc[1] + mov @acc[0], @acc[4] + adc 8*2($b_org), @acc[2] + mov @acc[1], @acc[5] + adc 8*3($b_org), @acc[3] + sbb $b_org, $b_org + + mov @acc[2], @acc[6] + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + mov @acc[3], @acc[7] + sbb 8*3($n_ptr), @acc[3] + sbb \$0, $b_org + + cmovc @acc[4], @acc[0] + cmovc @acc[5], @acc[1] + mov @acc[0], 8*0($r_ptr) + cmovc @acc[6], @acc[2] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[7], @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size add_mod_256,.-add_mod_256 + +######################################################################## +.globl mul_by_3_mod_256 +.hidden mul_by_3_mod_256 +.type mul_by_3_mod_256,\@function,3,"unwind" +.align 32 +mul_by_3_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 +.cfi_end_prologue + + mov $b_org,$n_ptr + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov $a_ptr,$b_org + mov 8*3($a_ptr), @acc[3] + + call __lshift_mod_256 + mov 0(%rsp),%r12 +.cfi_restore %r12 + jmp .Loaded_a_add_mod_256 + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_3_mod_256,.-mul_by_3_mod_256 + +.type __lshift_mod_256,\@abi-omnipotent +.align 32 +__lshift_mod_256: + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + mov @acc[0], @acc[4] + adc @acc[2], @acc[2] + mov @acc[1], @acc[5] + adc @acc[3], @acc[3] + sbb @acc[8], @acc[8] + + mov @acc[2], @acc[6] + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + mov @acc[3], @acc[7] + sbb 8*3($n_ptr), @acc[3] + sbb \$0, @acc[8] + + cmovc @acc[4], @acc[0] + cmovc @acc[5], @acc[1] + cmovc @acc[6], @acc[2] + cmovc @acc[7], @acc[3] + + ret +.size __lshift_mod_256,.-__lshift_mod_256 + +######################################################################## +.globl lshift_mod_256 +.hidden lshift_mod_256 +.type lshift_mod_256,\@function,4,"unwind" +.align 32 +lshift_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 
+.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + +.Loop_lshift_mod_256: + call __lshift_mod_256 + dec %edx + jnz .Loop_lshift_mod_256 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 0(%rsp),%r12 +.cfi_restore %r12 + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size lshift_mod_256,.-lshift_mod_256 + +######################################################################## +.globl rshift_mod_256 +.hidden rshift_mod_256 +.type rshift_mod_256,\@function,4,"unwind" +.align 32 +rshift_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[7] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + +.Loop_rshift_mod_256: + mov @acc[7], @acc[0] + and \$1, @acc[7] + mov 8*0($n_ptr), @acc[4] + neg @acc[7] + mov 8*1($n_ptr), @acc[5] + mov 8*2($n_ptr), @acc[6] + + and @acc[7], @acc[4] + and @acc[7], @acc[5] + and @acc[7], @acc[6] + and 8*3($n_ptr), @acc[7] + + add @acc[4], @acc[0] + adc @acc[5], @acc[1] + adc @acc[6], @acc[2] + adc @acc[7], @acc[3] + sbb @acc[4], @acc[4] + + shr \$1, @acc[0] + mov @acc[1], @acc[7] + shr \$1, @acc[1] + mov @acc[2], @acc[6] + shr \$1, @acc[2] + mov @acc[3], @acc[5] + shr \$1, @acc[3] + + shl \$63, @acc[7] + shl \$63, @acc[6] + or @acc[0], @acc[7] + shl \$63, @acc[5] + or @acc[6], @acc[1] + shl \$63, @acc[4] + or @acc[5], @acc[2] + or @acc[4], @acc[3] + + dec %edx + jnz .Loop_rshift_mod_256 + + mov @acc[7], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size rshift_mod_256,.-rshift_mod_256 + +######################################################################## +.globl cneg_mod_256 +.hidden cneg_mod_256 +.type cneg_mod_256,\@function,4,"unwind" +.align 32 +cneg_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[8] # load a[0:3] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov @acc[8], @acc[0] + mov 8*3($a_ptr), @acc[3] + or @acc[1], @acc[8] + or @acc[2], @acc[8] + or @acc[3], @acc[8] + mov \$-1, @acc[7] + + mov 8*0($n_ptr), @acc[4] # load n[0:3] + cmovnz @acc[7], @acc[8] # mask = a[0:3] ? -1 : 0 + mov 8*1($n_ptr), @acc[5] + mov 8*2($n_ptr), @acc[6] + and @acc[8], @acc[4] # n[0:3] &= mask + mov 8*3($n_ptr), @acc[7] + and @acc[8], @acc[5] + and @acc[8], @acc[6] + and @acc[8], @acc[7] + + sub @acc[0], @acc[4] # a[0:3] ? n[0:3]-a[0:3] : 0-0 + sbb @acc[1], @acc[5] + sbb @acc[2], @acc[6] + sbb @acc[3], @acc[7] + + or $b_org, $b_org # check condition flag + + cmovz @acc[0], @acc[4] # flag ? 
n[0:3]-a[0:3] : a[0:3] + cmovz @acc[1], @acc[5] + mov @acc[4], 8*0($r_ptr) + cmovz @acc[2], @acc[6] + mov @acc[5], 8*1($r_ptr) + cmovz @acc[3], @acc[7] + mov @acc[6], 8*2($r_ptr) + mov @acc[7], 8*3($r_ptr) + + mov 0(%rsp),%r12 +.cfi_restore %r12 + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size cneg_mod_256,.-cneg_mod_256 + +######################################################################## +.globl sub_mod_256 +.hidden sub_mod_256 +.type sub_mod_256,\@function,4,"unwind" +.align 32 +sub_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + + sub 8*0($b_org), @acc[0] + mov 8*0($n_ptr), @acc[4] + sbb 8*1($b_org), @acc[1] + mov 8*1($n_ptr), @acc[5] + sbb 8*2($b_org), @acc[2] + mov 8*2($n_ptr), @acc[6] + sbb 8*3($b_org), @acc[3] + mov 8*3($n_ptr), @acc[7] + sbb $b_org, $b_org + + and $b_org, @acc[4] + and $b_org, @acc[5] + and $b_org, @acc[6] + and $b_org, @acc[7] + + add @acc[4], @acc[0] + adc @acc[5], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[6], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[7], @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size sub_mod_256,.-sub_mod_256 + +######################################################################## +.globl check_mod_256 +.hidden check_mod_256 +.type check_mod_256,\@function,2,"unwind" +.align 32 +check_mod_256: +.cfi_startproc + mov 8*0($r_ptr), %rax + mov 8*1($r_ptr), @acc[1] + mov 8*2($r_ptr), @acc[2] + mov 8*3($r_ptr), @acc[3] + + mov %rax, @acc[0] # see if it's zero + or @acc[1], %rax + or @acc[2], %rax + or @acc[3], %rax + + sub 8*0($a_ptr), @acc[0] # does subtracting modulus borrow? 
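+	# the sbb chain below turns $a_ptr into an all-ones mask when the
+	# value is below the modulus; combined with the zero test above,
+	# check_mod_256 returns 1 exactly when 0 < value < modulus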
+ sbb 8*1($a_ptr), @acc[1] + sbb 8*2($a_ptr), @acc[2] + sbb 8*3($a_ptr), @acc[3] + sbb $a_ptr, $a_ptr + + mov \$1, %rdx + cmp \$0, %rax + cmovne %rdx, %rax + and $a_ptr, %rax +.cfi_epilogue + ret +.cfi_endproc +.size check_mod_256,.-check_mod_256 + +######################################################################## +.globl add_n_check_mod_256 +.hidden add_n_check_mod_256 +.type add_n_check_mod_256,\@function,4,"unwind" +.align 32 +add_n_check_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + + add 8*0($b_org), @acc[0] + adc 8*1($b_org), @acc[1] + mov @acc[0], @acc[4] + adc 8*2($b_org), @acc[2] + mov @acc[1], @acc[5] + adc 8*3($b_org), @acc[3] + sbb $b_org, $b_org + + mov @acc[2], @acc[6] + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + mov @acc[3], @acc[7] + sbb 8*3($n_ptr), @acc[3] + sbb \$0, $b_org + + cmovc @acc[4], @acc[0] + cmovc @acc[5], @acc[1] + mov @acc[0], 8*0($r_ptr) + cmovc @acc[6], @acc[2] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[7], @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + or @acc[1], @acc[0] + or @acc[3], @acc[2] + or @acc[2], @acc[0] + mov \$1, %rax + cmovz @acc[0], %rax + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size add_n_check_mod_256,.-add_n_check_mod_256 + +######################################################################## +.globl sub_n_check_mod_256 +.hidden sub_n_check_mod_256 +.type sub_n_check_mod_256,\@function,4,"unwind" +.align 32 +sub_n_check_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + + sub 8*0($b_org), @acc[0] + mov 8*0($n_ptr), @acc[4] + sbb 8*1($b_org), @acc[1] + mov 8*1($n_ptr), @acc[5] + sbb 8*2($b_org), @acc[2] + mov 8*2($n_ptr), @acc[6] + sbb 8*3($b_org), @acc[3] + mov 8*3($n_ptr), @acc[7] + sbb $b_org, $b_org + + and $b_org, @acc[4] + and $b_org, @acc[5] + and $b_org, @acc[6] + and $b_org, @acc[7] + + add @acc[4], @acc[0] + adc @acc[5], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[6], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[7], @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + or @acc[1], @acc[0] + or @acc[3], @acc[2] + or @acc[2], @acc[0] + mov \$1, %rax + cmovz @acc[0], %rax + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size sub_n_check_mod_256,.-sub_n_check_mod_256 +___ +} + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/add_mod_384-armv8.pl b/crypto/blst_src/asm/add_mod_384-armv8.pl new file mode 100755 index 00000000000..6accdbb19a1 --- /dev/null +++ b/crypto/blst_src/asm/add_mod_384-armv8.pl @@ -0,0 +1,937 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3); + +@mod=map("x$_",(4..9)); +@a=map("x$_",(10..15)); +@b=map("x$_",(16,17,19..22)); +$carry=$n_ptr; + +$code.=<<___; +.text + +.globl add_mod_384 +.hidden add_mod_384 +.type add_mod_384,%function +.align 5 +add_mod_384: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size add_mod_384,.-add_mod_384 + +.type __add_mod_384,%function +.align 5 +__add_mod_384: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @b[2],@b[3],[$b_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + ldp @b[4],@b[5],[$b_ptr,#32] + +__add_mod_384_ab_are_loaded: + adds @a[0],@a[0],@b[0] + adcs @a[1],@a[1],@b[1] + adcs @a[2],@a[2],@b[2] + adcs @a[3],@a[3],@b[3] + adcs @a[4],@a[4],@b[4] + adcs @a[5],@a[5],@b[5] + adc $carry,xzr,xzr + + subs @b[0],@a[0],@mod[0] + sbcs @b[1],@a[1],@mod[1] + sbcs @b[2],@a[2],@mod[2] + sbcs @b[3],@a[3],@mod[3] + sbcs @b[4],@a[4],@mod[4] + sbcs @b[5],@a[5],@mod[5] + sbcs xzr,$carry,xzr + + csel @a[0],@a[0],@b[0],lo + csel @a[1],@a[1],@b[1],lo + csel @a[2],@a[2],@b[2],lo + csel @a[3],@a[3],@b[3],lo + csel @a[4],@a[4],@b[4],lo + csel @a[5],@a[5],@b[5],lo + + ret +.size __add_mod_384,.-__add_mod_384 + +.globl add_mod_384x +.hidden add_mod_384x +.type add_mod_384x,%function +.align 5 +add_mod_384x: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __add_mod_384 + + stp @a[0],@a[1],[$r_ptr] + add $a_ptr,$a_ptr,#48 + stp @a[2],@a[3],[$r_ptr,#16] + add $b_ptr,$b_ptr,#48 + stp @a[4],@a[5],[$r_ptr,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size add_mod_384x,.-add_mod_384x + +.globl rshift_mod_384 +.hidden rshift_mod_384 +.type rshift_mod_384,%function +.align 5 +rshift_mod_384: + paciasp + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + +.Loop_rshift_mod_384: + sub $b_ptr,$b_ptr,#1 + bl __rshift_mod_384 + cbnz $b_ptr,.Loop_rshift_mod_384 + + ldr x30,[sp,#8] + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size rshift_mod_384,.-rshift_mod_384 + +.type __rshift_mod_384,%function +.align 5 +__rshift_mod_384: + sbfx @b[5],@a[0],#0,#1 + and @b[0],@b[5],@mod[0] + and @b[1],@b[5],@mod[1] + adds @a[0],@a[0],@b[0] + and @b[2],@b[5],@mod[2] + adcs @a[1],@a[1],@b[1] + and @b[3],@b[5],@mod[3] + adcs @a[2],@a[2],@b[2] + and @b[4],@b[5],@mod[4] + adcs @a[3],@a[3],@b[3] + and @b[5],@b[5],@mod[5] + adcs @a[4],@a[4],@b[4] + extr @a[0],@a[1],@a[0],#1 // a[0:5] >>= 1 + adcs @a[5],@a[5],@b[5] + extr @a[1],@a[2],@a[1],#1 + adc @b[5],xzr,xzr + extr @a[2],@a[3],@a[2],#1 + extr @a[3],@a[4],@a[3],#1 + extr @a[4],@a[5],@a[4],#1 + extr @a[5],@b[5],@a[5],#1 + ret +.size __rshift_mod_384,.-__rshift_mod_384 + +.globl div_by_2_mod_384 +.hidden div_by_2_mod_384 +.type div_by_2_mod_384,%function +.align 5 +div_by_2_mod_384: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __rshift_mod_384 + + ldr x30,[sp,#8] + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size div_by_2_mod_384,.-div_by_2_mod_384 + +.globl lshift_mod_384 +.hidden lshift_mod_384 +.type lshift_mod_384,%function +.align 5 +lshift_mod_384: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + +.Loop_lshift_mod_384: + sub $b_ptr,$b_ptr,#1 + bl __lshift_mod_384 + cbnz $b_ptr,.Loop_lshift_mod_384 + + ldr x30,[sp,#8] + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size lshift_mod_384,.-lshift_mod_384 + +.type __lshift_mod_384,%function +.align 5 +__lshift_mod_384: + adds @a[0],@a[0],@a[0] + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $carry,xzr,xzr + + subs @b[0],@a[0],@mod[0] + sbcs @b[1],@a[1],@mod[1] + sbcs @b[2],@a[2],@mod[2] + sbcs @b[3],@a[3],@mod[3] + sbcs @b[4],@a[4],@mod[4] + sbcs @b[5],@a[5],@mod[5] + sbcs xzr,$carry,xzr + + csel @a[0],@a[0],@b[0],lo + csel @a[1],@a[1],@b[1],lo + csel @a[2],@a[2],@b[2],lo + csel @a[3],@a[3],@b[3],lo + csel @a[4],@a[4],@b[4],lo + csel @a[5],@a[5],@b[5],lo + + ret +.size __lshift_mod_384,.-__lshift_mod_384 + +.globl mul_by_3_mod_384 +.hidden mul_by_3_mod_384 +.type mul_by_3_mod_384,%function +.align 5 +mul_by_3_mod_384: + paciasp + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __lshift_mod_384 + + ldp @b[0],@b[1],[$a_ptr] + ldp @b[2],@b[3],[$a_ptr,#16] + ldp @b[4],@b[5],[$a_ptr,#32] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size mul_by_3_mod_384,.-mul_by_3_mod_384 + +.globl mul_by_8_mod_384 +.hidden mul_by_8_mod_384 +.type mul_by_8_mod_384,%function +.align 5 +mul_by_8_mod_384: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size mul_by_8_mod_384,.-mul_by_8_mod_384 + +.globl mul_by_3_mod_384x +.hidden mul_by_3_mod_384x +.type mul_by_3_mod_384x,%function +.align 5 +mul_by_3_mod_384x: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __lshift_mod_384 + + ldp @b[0],@b[1],[$a_ptr] + ldp @b[2],@b[3],[$a_ptr,#16] + ldp @b[4],@b[5],[$a_ptr,#32] + + bl __add_mod_384_ab_are_loaded + + stp @a[0],@a[1],[$r_ptr] + ldp @a[0],@a[1],[$a_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#16] + ldp @a[2],@a[3],[$a_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#32] + ldp @a[4],@a[5],[$a_ptr,#80] + + bl __lshift_mod_384 + + ldp @b[0],@b[1],[$a_ptr,#48] + ldp @b[2],@b[3],[$a_ptr,#64] + ldp @b[4],@b[5],[$a_ptr,#80] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size mul_by_3_mod_384x,.-mul_by_3_mod_384x + +.globl mul_by_8_mod_384x +.hidden mul_by_8_mod_384x +.type mul_by_8_mod_384x,%function +.align 5 +mul_by_8_mod_384x: + paciasp + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + + stp @a[0],@a[1],[$r_ptr] + ldp @a[0],@a[1],[$a_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#16] + ldp @a[2],@a[3],[$a_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#32] + ldp @a[4],@a[5],[$a_ptr,#80] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size mul_by_8_mod_384x,.-mul_by_8_mod_384x + +.globl cneg_mod_384 +.hidden cneg_mod_384 +.type cneg_mod_384,%function +.align 5 +cneg_mod_384: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @mod[0],@mod[1],[$n_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @mod[2],@mod[3],[$n_ptr,#16] + + subs @b[0],@mod[0],@a[0] + ldp @a[4],@a[5],[$a_ptr,#32] + ldp @mod[4],@mod[5],[$n_ptr,#32] + orr $carry,@a[0],@a[1] + sbcs @b[1],@mod[1],@a[1] + orr $carry,$carry,@a[2] + sbcs @b[2],@mod[2],@a[2] + orr $carry,$carry,@a[3] + sbcs @b[3],@mod[3],@a[3] + orr $carry,$carry,@a[4] + sbcs @b[4],@mod[4],@a[4] + orr $carry,$carry,@a[5] + sbc @b[5],@mod[5],@a[5] + + cmp $carry,#0 + csetm $carry,ne + ands $b_ptr,$b_ptr,$carry + + csel @a[0],@a[0],@b[0],eq + csel @a[1],@a[1],@b[1],eq + csel @a[2],@a[2],@b[2],eq + csel @a[3],@a[3],@b[3],eq + stp @a[0],@a[1],[$r_ptr] + csel @a[4],@a[4],@b[4],eq + stp @a[2],@a[3],[$r_ptr,#16] + csel @a[5],@a[5],@b[5],eq + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size cneg_mod_384,.-cneg_mod_384 + +.globl sub_mod_384 +.hidden sub_mod_384 +.type sub_mod_384,%function +.align 5 +sub_mod_384: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size sub_mod_384,.-sub_mod_384 + +.type __sub_mod_384,%function +.align 5 +__sub_mod_384: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @b[2],@b[3],[$b_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + ldp @b[4],@b[5],[$b_ptr,#32] + + subs @a[0],@a[0],@b[0] + sbcs @a[1],@a[1],@b[1] + sbcs @a[2],@a[2],@b[2] + sbcs @a[3],@a[3],@b[3] + sbcs @a[4],@a[4],@b[4] + sbcs @a[5],@a[5],@b[5] + sbc $carry,xzr,xzr + + and @b[0],@mod[0],$carry + and @b[1],@mod[1],$carry + adds @a[0],@a[0],@b[0] + and @b[2],@mod[2],$carry + adcs @a[1],@a[1],@b[1] + and @b[3],@mod[3],$carry + adcs @a[2],@a[2],@b[2] + and @b[4],@mod[4],$carry + adcs @a[3],@a[3],@b[3] + and @b[5],@mod[5],$carry + adcs @a[4],@a[4],@b[4] + adc @a[5],@a[5],@b[5] + + ret +.size __sub_mod_384,.-__sub_mod_384 + +.globl sub_mod_384x +.hidden sub_mod_384x +.type sub_mod_384x,%function +.align 5 +sub_mod_384x: + paciasp + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __sub_mod_384 + + stp @a[0],@a[1],[$r_ptr] + add $a_ptr,$a_ptr,#48 + stp @a[2],@a[3],[$r_ptr,#16] + add $b_ptr,$b_ptr,#48 + stp @a[4],@a[5],[$r_ptr,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size sub_mod_384x,.-sub_mod_384x + +.globl mul_by_1_plus_i_mod_384x +.hidden mul_by_1_plus_i_mod_384x +.type mul_by_1_plus_i_mod_384x,%function +.align 5 +mul_by_1_plus_i_mod_384x: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + add $b_ptr,$a_ptr,#48 + + bl __sub_mod_384 // a->re - a->im + + ldp @b[0],@b[1],[$a_ptr] + ldp @b[2],@b[3],[$a_ptr,#16] + ldp @b[4],@b[5],[$a_ptr,#32] + stp @a[0],@a[1],[$r_ptr] + ldp @a[0],@a[1],[$a_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#16] + ldp @a[2],@a[3],[$a_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#32] + ldp @a[4],@a[5],[$a_ptr,#80] + + bl __add_mod_384_ab_are_loaded // a->re + a->im + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x + +.globl sgn0_pty_mod_384 +.hidden sgn0_pty_mod_384 +.type sgn0_pty_mod_384,%function +.align 5 +sgn0_pty_mod_384: + ldp @a[0],@a[1],[$r_ptr] + ldp @a[2],@a[3],[$r_ptr,#16] + ldp @a[4],@a[5],[$r_ptr,#32] + + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + ldp @mod[4],@mod[5],[$a_ptr,#32] + + and $r_ptr,@a[0],#1 + adds @a[0],@a[0],@a[0] + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $carry,xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc $carry,$carry,xzr + + mvn $carry,$carry + and $carry,$carry,#2 + orr $r_ptr,$r_ptr,$carry + + ret +.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 + +.globl sgn0_pty_mod_384x +.hidden sgn0_pty_mod_384x +.type sgn0_pty_mod_384x,%function +.align 5 +sgn0_pty_mod_384x: + ldp @a[0],@a[1],[$r_ptr] + ldp @a[2],@a[3],[$r_ptr,#16] + ldp @a[4],@a[5],[$r_ptr,#32] + + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + ldp @mod[4],@mod[5],[$a_ptr,#32] + + and $b_ptr,@a[0],#1 + orr $n_ptr,@a[0],@a[1] + adds @a[0],@a[0],@a[0] + orr $n_ptr,$n_ptr,@a[2] + adcs @a[1],@a[1],@a[1] + orr $n_ptr,$n_ptr,@a[3] + adcs @a[2],@a[2],@a[2] + orr $n_ptr,$n_ptr,@a[4] + adcs @a[3],@a[3],@a[3] + orr $n_ptr,$n_ptr,@a[5] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc @b[0],xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc @b[0],@b[0],xzr + + ldp @a[0],@a[1],[$r_ptr,#48] + ldp @a[2],@a[3],[$r_ptr,#64] + ldp @a[4],@a[5],[$r_ptr,#80] + + mvn @b[0],@b[0] + and @b[0],@b[0],#2 + orr $b_ptr,$b_ptr,@b[0] + + and $r_ptr,@a[0],#1 + orr $a_ptr,@a[0],@a[1] + adds @a[0],@a[0],@a[0] + orr $a_ptr,$a_ptr,@a[2] + adcs @a[1],@a[1],@a[1] + orr $a_ptr,$a_ptr,@a[3] + adcs @a[2],@a[2],@a[2] + orr 
$a_ptr,$a_ptr,@a[4] + adcs @a[3],@a[3],@a[3] + orr $a_ptr,$a_ptr,@a[5] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc @b[0],xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc @b[0],@b[0],xzr + + mvn @b[0],@b[0] + and @b[0],@b[0],#2 + orr $r_ptr,$r_ptr,@b[0] + + cmp $n_ptr,#0 + csel $n_ptr,$r_ptr,$b_ptr,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp $a_ptr,#0 + csel $a_ptr,$r_ptr,$b_ptr,ne // a->im!=0? sgn0(a->im) : sgn0(a->re) + + and $n_ptr,$n_ptr,#1 + and $a_ptr,$a_ptr,#2 + orr $r_ptr,$a_ptr,$n_ptr // pack sign and parity + + ret +.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x +___ +if (1) { +sub vec_select { +my $sz = shift; +my @v=map("v$_",(0..5,16..21)); + +$code.=<<___; +.globl vec_select_$sz +.hidden vec_select_$sz +.type vec_select_$sz,%function +.align 5 +vec_select_$sz: + dup v6.2d, $n_ptr + ld1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$a_ptr],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {@v[3].2d, @v[4].2d, @v[5].2d}, [$b_ptr],#48 +___ +for($i=0; $i<$sz-48; $i+=48) { +$code.=<<___; + bit @v[0].16b, @v[3].16b, v6.16b + ld1 {@v[6].2d, @v[7].2d, @v[8].2d}, [$a_ptr],#48 + bit @v[1].16b, @v[4].16b, v6.16b + ld1 {@v[9].2d, @v[10].2d, @v[11].2d}, [$b_ptr],#48 + bit @v[2].16b, @v[5].16b, v6.16b + st1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr],#48 +___ + @v = @v[6..11,0..5]; +} +$code.=<<___; + bit @v[0].16b, @v[3].16b, v6.16b + bit @v[1].16b, @v[4].16b, v6.16b + bit @v[2].16b, @v[5].16b, v6.16b + st1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr] + ret +.size vec_select_$sz,.-vec_select_$sz +___ +} +vec_select(32); +vec_select(48); +vec_select(96); +vec_select(192); +vec_select(144); +vec_select(288); +} + +{ +my ($inp, $end, $step) = map("x$_", (0..2)); + +$code.=<<___; +.globl vec_prefetch +.hidden vec_prefetch +.type vec_prefetch,%function +.align 5 +vec_prefetch: + add $end, $end, $inp + sub $end, $end, #1 + mov $step, #64 + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + prfm pldl1keep, [$inp] + ret +.size vec_prefetch,.-vec_prefetch +___ +my $len = $end; + +$code.=<<___; +.globl vec_is_zero_16x +.hidden vec_is_zero_16x +.type vec_is_zero_16x,%function +.align 5 +vec_is_zero_16x: + ld1 {v0.2d}, [$inp], #16 + lsr $len, $len, #4 + sub $len, $len, #1 + cbz $len, .Loop_is_zero_done + +.Loop_is_zero: + ld1 {v1.2d}, [$inp], #16 + orr v0.16b, v0.16b, v1.16b + sub $len, $len, #1 + cbnz $len, .Loop_is_zero + +.Loop_is_zero_done: + dup v1.2d, v0.2d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.2d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret +.size vec_is_zero_16x,.-vec_is_zero_16x +___ +} +{ +my ($inp1, $inp2, $len) = map("x$_", (0..2)); + +$code.=<<___; +.globl vec_is_equal_16x +.hidden vec_is_equal_16x +.type vec_is_equal_16x,%function +.align 5 +vec_is_equal_16x: + ld1 {v0.2d}, [$inp1], #16 + ld1 {v1.2d}, [$inp2], #16 + 
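+	# $len is a byte count; the comparison below works on 16-byte blocks,
+	# OR-ing together the XOR of every pair of blocks and testing the
+	# accumulator for zero only at the end, so there is no data-dependent
+	# early exit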
lsr $len, $len, #4 + eor v0.16b, v0.16b, v1.16b + +.Loop_is_equal: + sub $len, $len, #1 + cbz $len, .Loop_is_equal_done + ld1 {v1.2d}, [$inp1], #16 + ld1 {v2.2d}, [$inp2], #16 + eor v1.16b, v1.16b, v2.16b + orr v0.16b, v0.16b, v1.16b + b .Loop_is_equal + nop + +.Loop_is_equal_done: + dup v1.2d, v0.2d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.2d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret +.size vec_is_equal_16x,.-vec_is_equal_16x +___ +} + +print $code; + +close STDOUT; diff --git a/crypto/blst_src/asm/add_mod_384-x86_64.pl b/crypto/blst_src/asm/add_mod_384-x86_64.pl new file mode 100755 index 00000000000..a196191c108 --- /dev/null +++ b/crypto/blst_src/asm/add_mod_384-x86_64.pl @@ -0,0 +1,1500 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +{ ############################################################## 384 bits add +my @acc=map("%r$_",(8..15, "ax", "bx", "bp")); + push(@acc, $a_ptr); + +$code.=<<___; +.text + +.globl add_mod_384 +.hidden add_mod_384 +.type add_mod_384,\@function,4,"unwind" +.align 32 +add_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __add_mod_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size add_mod_384,.-add_mod_384 + +.type __add_mod_384,\@abi-omnipotent +.align 32 +__add_mod_384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + +__add_mod_384_a_is_loaded: + add 8*0($b_org), @acc[0] + adc 8*1($b_org), @acc[1] + adc 8*2($b_org), @acc[2] + mov @acc[0], @acc[6] + adc 8*3($b_org), @acc[3] + mov @acc[1], @acc[7] + adc 8*4($b_org), @acc[4] + mov @acc[2], @acc[8] + adc 8*5($b_org), @acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc @acc[9], @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __add_mod_384,.-__add_mod_384 + +.globl add_mod_384x +.hidden 
add_mod_384x +.type add_mod_384x,\@function,4,"unwind" +.align 32 +add_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$24, %rsp +.cfi_adjust_cfa_offset 24 +.cfi_end_prologue + + mov $a_ptr, 8*0(%rsp) + mov $b_org, 8*1(%rsp) + lea 48($a_ptr), $a_ptr # a->im + lea 48($b_org), $b_org # b->im + lea 48($r_ptr), $r_ptr # ret->im + call __add_mod_384 # add_mod_384(ret->im, a->im, b->im, mod); + + mov 8*0(%rsp), $a_ptr # a->re + mov 8*1(%rsp), $b_org # b->re + lea -48($r_ptr), $r_ptr # ret->re + call __add_mod_384 # add_mod_384(ret->re, a->re, b->re, mod); + + mov 24+8*0(%rsp),%r15 +.cfi_restore %r15 + mov 24+8*1(%rsp),%r14 +.cfi_restore %r14 + mov 24+8*2(%rsp),%r13 +.cfi_restore %r13 + mov 24+8*3(%rsp),%r12 +.cfi_restore %r12 + mov 24+8*4(%rsp),%rbx +.cfi_restore %rbx + mov 24+8*5(%rsp),%rbp +.cfi_restore %rbp + lea 24+8*6(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size add_mod_384x,.-add_mod_384x + +######################################################################## +.globl rshift_mod_384 +.hidden rshift_mod_384 +.type rshift_mod_384,\@function,4,"unwind" +.align 32 +rshift_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + +.Loop_rshift_mod_384: + call __rshift_mod_384 + dec %edx + jnz .Loop_rshift_mod_384 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size rshift_mod_384,.-rshift_mod_384 + +.type __rshift_mod_384,\@abi-omnipotent +.align 32 +__rshift_mod_384: + mov \$1, @acc[11] + mov 8*0($n_ptr), @acc[6] + and @acc[0], @acc[11] + mov 8*1($n_ptr), @acc[7] + neg @acc[11] + mov 8*2($n_ptr), @acc[8] + and @acc[11], @acc[6] + mov 8*3($n_ptr), @acc[9] + and @acc[11], @acc[7] + mov 8*4($n_ptr), @acc[10] + and @acc[11], @acc[8] + and @acc[11], @acc[9] + and @acc[11], @acc[10] + and 8*5($n_ptr), @acc[11] + + add @acc[0], @acc[6] + adc @acc[1], @acc[7] + adc @acc[2], @acc[8] + adc @acc[3], @acc[9] + adc @acc[4], @acc[10] + adc @acc[5], @acc[11] + sbb @acc[5], @acc[5] + + shr \$1, @acc[6] + mov @acc[7], @acc[0] + shr \$1, @acc[7] + mov @acc[8], @acc[1] + shr \$1, @acc[8] + mov @acc[9], @acc[2] + shr \$1, @acc[9] + mov @acc[10], @acc[3] + shr \$1, @acc[10] + mov @acc[11], @acc[4] + shr \$1, @acc[11] + shl \$63, @acc[0] + shl \$63, @acc[1] + or @acc[6], @acc[0] + shl \$63, @acc[2] + or @acc[7], @acc[1] + shl \$63, @acc[3] + or @acc[8], @acc[2] + shl \$63, @acc[4] + or @acc[9], @acc[3] + shl \$63, @acc[5] + or @acc[10], @acc[4] + or @acc[11], @acc[5] + + ret +.size __rshift_mod_384,.-__rshift_mod_384 + +.globl div_by_2_mod_384 +.hidden div_by_2_mod_384 +.type div_by_2_mod_384,\@function,3,"unwind" +.align 32 
+div_by_2_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov $b_org, $n_ptr + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + call __rshift_mod_384 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size div_by_2_mod_384,.-div_by_2_mod_384 + +######################################################################## +.globl lshift_mod_384 +.hidden lshift_mod_384 +.type lshift_mod_384,\@function,4,"unwind" +.align 32 +lshift_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + +.Loop_lshift_mod_384: + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + mov @acc[0], @acc[6] + adc @acc[3], @acc[3] + mov @acc[1], @acc[7] + adc @acc[4], @acc[4] + mov @acc[2], @acc[8] + adc @acc[5], @acc[5] + mov @acc[3], @acc[9] + sbb $r_ptr, $r_ptr + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + mov (%rsp), $r_ptr + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + cmovc @acc[9], @acc[3] + cmovc @acc[10], @acc[4] + cmovc @acc[11], @acc[5] + + dec %edx + jnz .Loop_lshift_mod_384 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size lshift_mod_384,.-lshift_mod_384 + +.type __lshift_mod_384,\@abi-omnipotent +.align 32 +__lshift_mod_384: + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + mov @acc[0], @acc[6] + adc @acc[3], @acc[3] + mov @acc[1], @acc[7] + adc @acc[4], @acc[4] + mov @acc[2], @acc[8] + adc @acc[5], @acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + cmovc @acc[9], @acc[3] + cmovc @acc[10], @acc[4] + cmovc @acc[11], @acc[5] + + 
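+	# if the subtraction of the modulus borrowed, the doubled value was
+	# already fully reduced; the cmovc chain above restores it without
+	# branching on secret data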
ret +.size __lshift_mod_384,.-__lshift_mod_384 + +######################################################################## +.globl mul_by_3_mod_384 +.hidden mul_by_3_mod_384 +.type mul_by_3_mod_384,\@function,3,"unwind" +.align 32 +mul_by_3_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov $b_org, $n_ptr + + call __lshift_mod_384 + + mov (%rsp), $b_org + call __add_mod_384_a_is_loaded + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_3_mod_384,.-mul_by_3_mod_384 + +.globl mul_by_8_mod_384 +.hidden mul_by_8_mod_384 +.type mul_by_8_mod_384,\@function,3,"unwind" +.align 32 +mul_by_8_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov $b_org, $n_ptr + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_8_mod_384,.-mul_by_8_mod_384 + +######################################################################## +.globl mul_by_3_mod_384x +.hidden mul_by_3_mod_384x +.type mul_by_3_mod_384x,\@function,3,"unwind" +.align 32 +mul_by_3_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov $b_org, $n_ptr + + call __lshift_mod_384 + + mov (%rsp), $b_org + call __add_mod_384_a_is_loaded + + mov (%rsp), $a_ptr + lea 8*6($r_ptr), $r_ptr + + mov 8*6($a_ptr), @acc[0] + mov 8*7($a_ptr), @acc[1] + mov 8*8($a_ptr), @acc[2] + mov 8*9($a_ptr), @acc[3] + mov 8*10($a_ptr), @acc[4] + mov 8*11($a_ptr), @acc[5] + + call __lshift_mod_384 + + mov \$8*6, $b_org + add (%rsp), $b_org + call __add_mod_384_a_is_loaded + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 
56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_3_mod_384x,.-mul_by_3_mod_384x + +.globl mul_by_8_mod_384x +.hidden mul_by_8_mod_384x +.type mul_by_8_mod_384x,\@function,3,"unwind" +.align 32 +mul_by_8_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov $b_org, $n_ptr + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov (%rsp), $a_ptr + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 48+8*0($a_ptr), @acc[0] + mov 48+8*1($a_ptr), @acc[1] + mov 48+8*2($a_ptr), @acc[2] + mov 48+8*3($a_ptr), @acc[3] + mov 48+8*4($a_ptr), @acc[4] + mov 48+8*5($a_ptr), @acc[5] + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov @acc[0], 48+8*0($r_ptr) + mov @acc[1], 48+8*1($r_ptr) + mov @acc[2], 48+8*2($r_ptr) + mov @acc[3], 48+8*3($r_ptr) + mov @acc[4], 48+8*4($r_ptr) + mov @acc[5], 48+8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_8_mod_384x,.-mul_by_8_mod_384x + +######################################################################## +.globl cneg_mod_384 +.hidden cneg_mod_384 +.type cneg_mod_384,\@function,4,"unwind" +.align 32 +cneg_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $b_org # condition flag +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), $b_org # load a[0:5] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov $b_org, @acc[0] + mov 8*3($a_ptr), @acc[3] + or @acc[1], $b_org + mov 8*4($a_ptr), @acc[4] + or @acc[2], $b_org + mov 8*5($a_ptr), @acc[5] + or @acc[3], $b_org + mov \$-1, @acc[11] + or @acc[4], $b_org + or @acc[5], $b_org + + mov 8*0($n_ptr), @acc[6] # load n[0:5] + cmovnz @acc[11], $b_org # mask = a[0:5] ? -1 : 0 + mov 8*1($n_ptr), @acc[7] + mov 8*2($n_ptr), @acc[8] + and $b_org, @acc[6] # n[0:5] &= mask + mov 8*3($n_ptr), @acc[9] + and $b_org, @acc[7] + mov 8*4($n_ptr), @acc[10] + and $b_org, @acc[8] + mov 8*5($n_ptr), @acc[11] + and $b_org, @acc[9] + mov 0(%rsp), $n_ptr # restore condition flag + and $b_org, @acc[10] + and $b_org, @acc[11] + + sub @acc[0], @acc[6] # a[0:5] ? n[0:5]-a[0:5] : 0-0 + sbb @acc[1], @acc[7] + sbb @acc[2], @acc[8] + sbb @acc[3], @acc[9] + sbb @acc[4], @acc[10] + sbb @acc[5], @acc[11] + + or $n_ptr, $n_ptr # check condition flag + + cmovz @acc[0], @acc[6] # flag ? 
n[0:5]-a[0:5] : a[0:5] + cmovz @acc[1], @acc[7] + cmovz @acc[2], @acc[8] + mov @acc[6], 8*0($r_ptr) + cmovz @acc[3], @acc[9] + mov @acc[7], 8*1($r_ptr) + cmovz @acc[4], @acc[10] + mov @acc[8], 8*2($r_ptr) + cmovz @acc[5], @acc[11] + mov @acc[9], 8*3($r_ptr) + mov @acc[10], 8*4($r_ptr) + mov @acc[11], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size cneg_mod_384,.-cneg_mod_384 + +######################################################################## +.globl sub_mod_384 +.hidden sub_mod_384 +.type sub_mod_384,\@function,4,"unwind" +.align 32 +sub_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __sub_mod_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sub_mod_384,.-sub_mod_384 + +.type __sub_mod_384,\@abi-omnipotent +.align 32 +__sub_mod_384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + sub 8*0($b_org), @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb 8*1($b_org), @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb 8*2($b_org), @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb 8*3($b_org), @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb 8*4($b_org), @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb 8*5($b_org), @acc[5] + mov 8*5($n_ptr), @acc[11] + sbb $b_org, $b_org + + and $b_org, @acc[6] + and $b_org, @acc[7] + and $b_org, @acc[8] + and $b_org, @acc[9] + and $b_org, @acc[10] + and $b_org, @acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[8], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[9], @acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[10], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[11], @acc[5] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __sub_mod_384,.-__sub_mod_384 + +.globl sub_mod_384x +.hidden sub_mod_384x +.type sub_mod_384x,\@function,4,"unwind" +.align 32 +sub_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$24, %rsp +.cfi_adjust_cfa_offset 24 +.cfi_end_prologue + + mov $a_ptr, 8*0(%rsp) + mov $b_org, 8*1(%rsp) + lea 48($a_ptr), $a_ptr # a->im + lea 48($b_org), $b_org # b->im + lea 48($r_ptr), $r_ptr # ret->im + call __sub_mod_384 # sub_mod_384(ret->im, a->im, b->im, mod); + + mov 8*0(%rsp), $a_ptr # a->re + mov 8*1(%rsp), $b_org # b->re + lea -48($r_ptr), $r_ptr # ret->re + call __sub_mod_384 # sub_mod_384(ret->re, a->re, b->re, mod); + + mov 24+8*0(%rsp),%r15 +.cfi_restore %r15 + mov 24+8*1(%rsp),%r14 +.cfi_restore %r14 + mov 24+8*2(%rsp),%r13 +.cfi_restore %r13 + mov 24+8*3(%rsp),%r12 +.cfi_restore %r12 + mov 24+8*4(%rsp),%rbx +.cfi_restore %rbx + mov 24+8*5(%rsp),%rbp +.cfi_restore %rbp + lea 
24+8*6(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sub_mod_384x,.-sub_mod_384x +___ +} +{ ###################################################### ret = a * (1 + i) +my ($r_ptr,$a_ptr,$n_ptr) = ("%rdi","%rsi","%rdx"); +my @acc=map("%r$_",(8..15, "ax", "bx", "cx", "bp")); + +$code.=<<___; +.globl mul_by_1_plus_i_mod_384x +.hidden mul_by_1_plus_i_mod_384x +.type mul_by_1_plus_i_mod_384x,\@function,3,"unwind" +.align 32 +mul_by_1_plus_i_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$56, %rsp +.cfi_adjust_cfa_offset 56 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov @acc[0], @acc[6] + add 8*6($a_ptr), @acc[0] # a->re + a->im + mov @acc[1], @acc[7] + adc 8*7($a_ptr), @acc[1] + mov @acc[2], @acc[8] + adc 8*8($a_ptr), @acc[2] + mov @acc[3], @acc[9] + adc 8*9($a_ptr), @acc[3] + mov @acc[4], @acc[10] + adc 8*10($a_ptr), @acc[4] + mov @acc[5], @acc[11] + adc 8*11($a_ptr), @acc[5] + mov $r_ptr, 8*6(%rsp) # offload r_ptr + sbb $r_ptr, $r_ptr + + sub 8*6($a_ptr), @acc[6] # a->re - a->im + sbb 8*7($a_ptr), @acc[7] + sbb 8*8($a_ptr), @acc[8] + sbb 8*9($a_ptr), @acc[9] + sbb 8*10($a_ptr), @acc[10] + sbb 8*11($a_ptr), @acc[11] + sbb $a_ptr, $a_ptr + + mov @acc[0], 8*0(%rsp) # offload a->re + a->im [without carry] + mov 8*0($n_ptr), @acc[0] + mov @acc[1], 8*1(%rsp) + mov 8*1($n_ptr), @acc[1] + mov @acc[2], 8*2(%rsp) + mov 8*2($n_ptr), @acc[2] + mov @acc[3], 8*3(%rsp) + mov 8*3($n_ptr), @acc[3] + mov @acc[4], 8*4(%rsp) + and $a_ptr, @acc[0] + mov 8*4($n_ptr), @acc[4] + mov @acc[5], 8*5(%rsp) + and $a_ptr, @acc[1] + mov 8*5($n_ptr), @acc[5] + and $a_ptr, @acc[2] + and $a_ptr, @acc[3] + and $a_ptr, @acc[4] + and $a_ptr, @acc[5] + mov 8*6(%rsp), $a_ptr # restore r_ptr + + add @acc[0], @acc[6] + mov 8*0(%rsp), @acc[0] # restore a->re + a->im + adc @acc[1], @acc[7] + mov 8*1(%rsp), @acc[1] + adc @acc[2], @acc[8] + mov 8*2(%rsp), @acc[2] + adc @acc[3], @acc[9] + mov 8*3(%rsp), @acc[3] + adc @acc[4], @acc[10] + mov 8*4(%rsp), @acc[4] + adc @acc[5], @acc[11] + mov 8*5(%rsp), @acc[5] + + mov @acc[6], 8*0($a_ptr) # ret->re = a->re - a->im + mov @acc[0], @acc[6] + mov @acc[7], 8*1($a_ptr) + mov @acc[8], 8*2($a_ptr) + mov @acc[1], @acc[7] + mov @acc[9], 8*3($a_ptr) + mov @acc[10], 8*4($a_ptr) + mov @acc[2], @acc[8] + mov @acc[11], 8*5($a_ptr) + + sub 8*0($n_ptr), @acc[0] + mov @acc[3], @acc[9] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + mov @acc[4], @acc[10] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*6($a_ptr) # ret->im = a->re + a->im + cmovc @acc[9], @acc[3] + mov @acc[1], 8*7($a_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*8($a_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*9($a_ptr) + mov @acc[4], 8*10($a_ptr) + mov @acc[5], 8*11($a_ptr) + + mov 56+8*0(%rsp),%r15 +.cfi_restore %r15 + mov 56+8*1(%rsp),%r14 +.cfi_restore %r14 + mov 56+8*2(%rsp),%r13 +.cfi_restore %r13 + mov 56+8*3(%rsp),%r12 +.cfi_restore %r12 + mov 56+8*4(%rsp),%rbx +.cfi_restore %rbx + mov 56+8*5(%rsp),%rbp +.cfi_restore %rbp + lea 56+8*6(%rsp),%rsp +.cfi_adjust_cfa_offset -56-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size 
mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x +___ +} +{ ###################################################### +my ($r_ptr,$n_ptr) = ("%rdi","%rsi"); +my @acc=map("%r$_",(8..11, "cx", "dx", "bx", "bp")); + +$code.=<<___; +.globl sgn0_pty_mod_384 +.hidden sgn0_pty_mod_384 +.type sgn0_pty_mod_384,\@function,2,"unwind" +.align 32 +sgn0_pty_mod_384: +.cfi_startproc +.cfi_end_prologue + mov 8*0($r_ptr), @acc[0] + mov 8*1($r_ptr), @acc[1] + mov 8*2($r_ptr), @acc[2] + mov 8*3($r_ptr), @acc[3] + mov 8*4($r_ptr), @acc[4] + mov 8*5($r_ptr), @acc[5] + + xor %rax, %rax + mov @acc[0], $r_ptr + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + not %rax # 2*x > p, which means "negative" + and \$1, $r_ptr + and \$2, %rax + or $r_ptr, %rax # pack sign and parity + +.cfi_epilogue + ret +.cfi_endproc +.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 + +.globl sgn0_pty_mod_384x +.hidden sgn0_pty_mod_384x +.type sgn0_pty_mod_384x,\@function,2,"unwind" +.align 32 +sgn0_pty_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*6($r_ptr), @acc[0] # sgn0(a->im) + mov 8*7($r_ptr), @acc[1] + mov 8*8($r_ptr), @acc[2] + mov 8*9($r_ptr), @acc[3] + mov 8*10($r_ptr), @acc[4] + mov 8*11($r_ptr), @acc[5] + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + lea 0($r_ptr), %rax # sgn0(a->re) + xor $r_ptr, $r_ptr + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, $r_ptr + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + mov @acc[0], 0(%rsp) # a->im is zero or not + not $r_ptr # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, $r_ptr + or @acc[7], $r_ptr # pack sign and parity + + mov 8*0(%rax), @acc[0] + mov 8*1(%rax), @acc[1] + mov 8*2(%rax), @acc[2] + mov 8*3(%rax), @acc[3] + mov 8*4(%rax), @acc[4] + mov 8*5(%rax), @acc[5] + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + xor %rax, %rax + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + mov 0(%rsp), @acc[6] + + not %rax # 2*x > p, which means "negative" + + test @acc[0], @acc[0] + cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) + + test @acc[6], @acc[6] + cmovnz $r_ptr, %rax # a->im!=0? sgn0(a->im) : sgn0(a->re) + + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp), %rbx +.cfi_restore %rbx + mov 16(%rsp), %rbp +.cfi_restore %rbp + lea 24(%rsp), %rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x +___ +} +if (0) { +my $inp = $win64 ? 
"%rcx" : "%rdi"; +$code.=<<___; +.globl nbits_384 +.hidden nbits_384 +.type nbits_384,\@abi-omnipotent +.align 32 +nbits_384: + mov 8*5($inp), %r8 + mov 8*4($inp), %r9 + mov 8*3($inp), %r10 + mov 8*2($inp), %r11 + mov \$-1, %rdx + mov \$127, %eax + bsr %r8, %r8 + cmovnz %rdx,%r9 + cmovz %rax,%r8 + bsr %r9, %r9 + cmovnz %rdx,%r10 + cmovz %rax,%r9 + xor \$63,%r8 + bsr %r10, %r10 + cmovnz %rdx, %r11 + cmovz %rax, %r10 + xor \$63,%r9 + add %r8, %r9 + mov 8*1($inp), %r8 + bsr %r11, %r11 + cmovnz %rdx, %r8 + cmovz %rax, %r11 + xor \$63, %r10 + add %r9, %r10 + mov 8*0($inp), %r9 + bsr %r8, %r8 + cmovnz %rdx, %r9 + cmovz %rax, %r8 + xor \$63, %r11 + add %r10, %r11 + bsr %r9, %r9 + cmovz %rax, %r9 + xor \$63, %r8 + add %r11, %r8 + xor \$63, %r9 + add %r8, %r9 + mov \$384, %eax + sub %r9, %rax + ret +.size nbits_384,.-nbits_384 +___ +} + +if (1) { +my ($out, $inp1, $inp2, $select) = $win64 ? ("%rcx", "%rdx", "%r8", "%r9d") + : ("%rdi", "%rsi", "%rdx", "%ecx"); + +sub vec_select { +my $sz = shift; +my $half = $sz/2; +my ($xmm0,$xmm1,$xmm2,$xmm3)=map("%xmm$_",(0..3)); + +$code.=<<___; +.globl vec_select_$sz +.hidden vec_select_$sz +.type vec_select_$sz,\@abi-omnipotent +.align 32 +vec_select_$sz: + movd $select, %xmm5 + pxor %xmm4,%xmm4 + pshufd \$0,%xmm5,%xmm5 # broadcast + movdqu ($inp1),$xmm0 + lea $half($inp1),$inp1 + pcmpeqd %xmm4,%xmm5 + movdqu ($inp2),$xmm1 + lea $half($inp2),$inp2 + pcmpeqd %xmm5,%xmm4 + lea $half($out),$out +___ +for($i=0; $i<$sz-16; $i+=16) { +$code.=<<___; + pand %xmm4,$xmm0 + movdqu $i+16-$half($inp1),$xmm2 + pand %xmm5,$xmm1 + movdqu $i+16-$half($inp2),$xmm3 + por $xmm1,$xmm0 + movdqu $xmm0,$i-$half($out) +___ + ($xmm0,$xmm1,$xmm2,$xmm3)=($xmm2,$xmm3,$xmm0,$xmm1); +} +$code.=<<___; + pand %xmm4,$xmm0 + pand %xmm5,$xmm1 + por $xmm1,$xmm0 + movdqu $xmm0,$i-$half($out) + ret +.size vec_select_$sz,.-vec_select_$sz +___ +} +vec_select(32); +vec_select(48); +vec_select(96); +vec_select(192); +vec_select(144); +vec_select(288); +} + +{ +my ($inp, $end) = $win64 ? ("%rcx", "%rdx") : ("%rdi", "%rsi"); + +$code.=<<___; +.globl vec_prefetch +.hidden vec_prefetch +.type vec_prefetch,\@abi-omnipotent +.align 32 +vec_prefetch: + leaq -1($inp,$end), $end + mov \$64, %rax + xor %r8, %r8 + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + prefetchnta ($inp) + ret +.size vec_prefetch,.-vec_prefetch +___ +my $len = $win64 ? "%edx" : "%esi"; + +$code.=<<___; +.globl vec_is_zero_16x +.hidden vec_is_zero_16x +.type vec_is_zero_16x,\@abi-omnipotent +.align 32 +vec_is_zero_16x: + shr \$4, $len + movdqu ($inp), %xmm0 + lea 16($inp), $inp + +.Loop_is_zero: + dec $len + jz .Loop_is_zero_done + movdqu ($inp), %xmm1 + lea 16($inp), $inp + por %xmm1, %xmm0 + jmp .Loop_is_zero + +.Loop_is_zero_done: + pshufd \$0x4e, %xmm0, %xmm1 + por %xmm1, %xmm0 + movq %xmm0, %rax + inc $len # now it's 1 + test %rax, %rax + cmovnz $len, %eax + xor \$1, %eax + ret +.size vec_is_zero_16x,.-vec_is_zero_16x +___ +} +{ +my ($inp1, $inp2, $len) = $win64 ? 
("%rcx", "%rdx", "%r8d") + : ("%rdi", "%rsi", "%edx"); +$code.=<<___; +.globl vec_is_equal_16x +.hidden vec_is_equal_16x +.type vec_is_equal_16x,\@abi-omnipotent +.align 32 +vec_is_equal_16x: + shr \$4, $len + movdqu ($inp1), %xmm0 + movdqu ($inp2), %xmm1 + sub $inp1, $inp2 + lea 16($inp1), $inp1 + pxor %xmm1, %xmm0 + +.Loop_is_equal: + dec $len + jz .Loop_is_equal_done + movdqu ($inp1), %xmm1 + movdqu ($inp1,$inp2), %xmm2 + lea 16($inp1), $inp1 + pxor %xmm2, %xmm1 + por %xmm1, %xmm0 + jmp .Loop_is_equal + +.Loop_is_equal_done: + pshufd \$0x4e, %xmm0, %xmm1 + por %xmm1, %xmm0 + movq %xmm0, %rax + inc $len # now it's 1 + test %rax, %rax + cmovnz $len, %eax + xor \$1, %eax + ret +.size vec_is_equal_16x,.-vec_is_equal_16x +___ +} +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/add_mod_384x384-x86_64.pl b/crypto/blst_src/asm/add_mod_384x384-x86_64.pl new file mode 100755 index 00000000000..6ee3cf8760a --- /dev/null +++ b/crypto/blst_src/asm/add_mod_384x384-x86_64.pl @@ -0,0 +1,260 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +# common accumulator layout +@acc=map("%r$_",(8..15)); + +############################################################ 384x384 add/sub +# Double-width addition/subtraction modulo n<<384, as opposite to +# naively expected modulo n*n. It works because n<<384 is the actual +# input boundary condition for Montgomery reduction, not n*n. +# Just in case, this is duplicated, but only one module is +# supposed to be linked... 
+{ +my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.text + +.type __add_mod_384x384,\@abi-omnipotent +.align 32 +__add_mod_384x384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov 8*6($a_ptr), @acc[6] + + add 8*0($b_org), @acc[0] + mov 8*7($a_ptr), @acc[7] + adc 8*1($b_org), @acc[1] + mov 8*8($a_ptr), @acc[8] + adc 8*2($b_org), @acc[2] + mov 8*9($a_ptr), @acc[9] + adc 8*3($b_org), @acc[3] + mov 8*10($a_ptr), @acc[10] + adc 8*4($b_org), @acc[4] + mov 8*11($a_ptr), @acc[11] + adc 8*5($b_org), @acc[5] + mov @acc[0], 8*0($r_ptr) + adc 8*6($b_org), @acc[6] + mov @acc[1], 8*1($r_ptr) + adc 8*7($b_org), @acc[7] + mov @acc[2], 8*2($r_ptr) + adc 8*8($b_org), @acc[8] + mov @acc[4], 8*4($r_ptr) + mov @acc[6], @acc[0] + adc 8*9($b_org), @acc[9] + mov @acc[3], 8*3($r_ptr) + mov @acc[7], @acc[1] + adc 8*10($b_org), @acc[10] + mov @acc[5], 8*5($r_ptr) + mov @acc[8], @acc[2] + adc 8*11($b_org), @acc[11] + mov @acc[9], @acc[3] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[7] + mov @acc[10], @acc[4] + sbb 8*2($n_ptr), @acc[8] + sbb 8*3($n_ptr), @acc[9] + sbb 8*4($n_ptr), @acc[10] + mov @acc[11], @acc[5] + sbb 8*5($n_ptr), @acc[11] + sbb \$0, $b_org + + cmovc @acc[0], @acc[6] + cmovc @acc[1], @acc[7] + cmovc @acc[2], @acc[8] + mov @acc[6], 8*6($r_ptr) + cmovc @acc[3], @acc[9] + mov @acc[7], 8*7($r_ptr) + cmovc @acc[4], @acc[10] + mov @acc[8], 8*8($r_ptr) + cmovc @acc[5], @acc[11] + mov @acc[9], 8*9($r_ptr) + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __add_mod_384x384,.-__add_mod_384x384 + +.type __sub_mod_384x384,\@abi-omnipotent +.align 32 +__sub_mod_384x384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov 8*6($a_ptr), @acc[6] + + sub 8*0($b_org), @acc[0] + mov 8*7($a_ptr), @acc[7] + sbb 8*1($b_org), @acc[1] + mov 8*8($a_ptr), @acc[8] + sbb 8*2($b_org), @acc[2] + mov 8*9($a_ptr), @acc[9] + sbb 8*3($b_org), @acc[3] + mov 8*10($a_ptr), @acc[10] + sbb 8*4($b_org), @acc[4] + mov 8*11($a_ptr), @acc[11] + sbb 8*5($b_org), @acc[5] + mov @acc[0], 8*0($r_ptr) + sbb 8*6($b_org), @acc[6] + mov 8*0($n_ptr), @acc[0] + mov @acc[1], 8*1($r_ptr) + sbb 8*7($b_org), @acc[7] + mov 8*1($n_ptr), @acc[1] + mov @acc[2], 8*2($r_ptr) + sbb 8*8($b_org), @acc[8] + mov 8*2($n_ptr), @acc[2] + mov @acc[3], 8*3($r_ptr) + sbb 8*9($b_org), @acc[9] + mov 8*3($n_ptr), @acc[3] + mov @acc[4], 8*4($r_ptr) + sbb 8*10($b_org), @acc[10] + mov 8*4($n_ptr), @acc[4] + mov @acc[5], 8*5($r_ptr) + sbb 8*11($b_org), @acc[11] + mov 8*5($n_ptr), @acc[5] + sbb $b_org, $b_org + + and $b_org, @acc[0] + and $b_org, @acc[1] + and $b_org, @acc[2] + and $b_org, @acc[3] + and $b_org, @acc[4] + and $b_org, @acc[5] + + add @acc[0], @acc[6] + adc @acc[1], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[2], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[3], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[4], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[5], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.globl add_mod_384x384 +.hidden add_mod_384x384 +.type add_mod_384x384,\@function,4,"unwind" +.align 32 +add_mod_384x384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 
+.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __add_mod_384x384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size add_mod_384x384,.-add_mod_384x384 + +.globl sub_mod_384x384 +.hidden sub_mod_384x384 +.type sub_mod_384x384,\@function,4,"unwind" +.align 32 +sub_mod_384x384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __sub_mod_384x384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sub_mod_384x384,.-sub_mod_384x384 +___ +} + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/arm-xlate.pl b/crypto/blst_src/asm/arm-xlate.pl new file mode 100755 index 00000000000..35aab37407b --- /dev/null +++ b/crypto/blst_src/asm/arm-xlate.pl @@ -0,0 +1,386 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# ARM assembler distiller/adapter by \@dot-asm. + +use strict; + +################################################################ +# Recognized "flavour"-s are: +# +# linux[32|64] GNU assembler, effectively pass-through +# ios[32|64] global symbols' decorations, PIC tweaks, etc. +# win[32|64] Visual Studio armasm-specific directives +# coff[32|64] e.g. clang --target=arm-windows ... +# +my $flavour = shift; + $flavour = "linux" if (!$flavour or $flavour eq "void"); + +my $output = shift; +open STDOUT,">$output" || die "can't open $output: $!"; + +my %GLOBALS; +my $dotinlocallabels = ($flavour !~ /ios/) ? 
1 : 0; +my $in_proc; # used with 'windows' flavour + +################################################################ +# directives which need special treatment on different platforms +################################################################ +my $arch = sub { } if ($flavour !~ /linux|coff64/);# omit .arch +my $fpu = sub { } if ($flavour !~ /linux/); # omit .fpu + +my $rodata = sub { + SWITCH: for ($flavour) { + /linux/ && return ".section\t.rodata"; + /ios/ && return ".section\t__TEXT,__const"; + /coff/ && return ".section\t.rdata,\"dr\""; + /win/ && return "\tAREA\t|.rdata|,DATA,READONLY,ALIGN=8"; + last; + } +}; + +my $hidden = sub { + if ($flavour =~ /ios/) { ".private_extern\t".join(',',@_); } +} if ($flavour !~ /linux/); + +my $comm = sub { + my @args = split(/,\s*/,shift); + my $name = @args[0]; + my $global = \$GLOBALS{$name}; + my $ret; + + if ($flavour =~ /ios32/) { + $ret = ".comm\t_$name,@args[1]\n"; + $ret .= ".non_lazy_symbol_pointer\n"; + $ret .= "$name:\n"; + $ret .= ".indirect_symbol\t_$name\n"; + $ret .= ".long\t0\n"; + $ret .= ".previous"; + $name = "_$name"; + } elsif ($flavour =~ /win/) { + $ret = "\tCOMMON\t|$name|,@args[1]"; + } elsif ($flavour =~ /coff/) { + $ret = ".comm\t$name,@args[1]"; + } else { + $ret = ".comm\t".join(',',@args); + } + + $$global = $name; + $ret; +}; + +my $globl = sub { + my $name = shift; + my $global = \$GLOBALS{$name}; + my $ret; + + SWITCH: for ($flavour) { + /ios/ && do { $name = "_$name"; last; }; + /win/ && do { $ret = ""; last; }; + } + + $ret = ".globl $name" if (!defined($ret)); + $$global = $name; + $ret; +}; +my $global = $globl; + +my $extern = sub { + &$globl(@_); + if ($flavour =~ /win/) { + return "\tEXTERN\t@_"; + } + return; # return nothing +}; + +my $type = sub { + my $arg = join(',',@_); + my $ret; + + SWITCH: for ($flavour) { + /ios32/ && do { if ($arg =~ /(\w+),\s*%function/) { + $ret = "#ifdef __thumb2__\n" . + ".thumb_func $1\n" . + "#endif"; + } + last; + }; + /win/ && do { if ($arg =~ /(\w+),\s*%(function|object)/) { + my $type = "[DATA]"; + if ($2 eq "function") { + $in_proc = $1; + $type = "[FUNC]"; + } + $ret = $GLOBALS{$1} ? "\tEXPORT\t|$1|$type" + : ""; + } + last; + }; + /coff/ && do { if ($arg =~ /(\w+),\s*%function/) { + $ret = ".def $1;\n". + ".type 32;\n". + ".endef"; + } + last; + }; + } + return $ret; +} if ($flavour !~ /linux/); + +my $size = sub { + if ($in_proc && $flavour =~ /win/) { + $in_proc = undef; + return "\tENDP"; + } +} if ($flavour !~ /linux/); + +my $inst = sub { + if ($flavour =~ /win/) { "\tDCDU\t".join(',',@_); } + else { ".long\t".join(',',@_); } +} if ($flavour !~ /linux/); + +my $asciz = sub { + my $line = join(",",@_); + if ($line =~ /^"(.*)"$/) + { if ($flavour =~ /win/) { + "\tDCB\t$line,0\n\tALIGN\t4"; + } else { + ".byte " . join(",",unpack("C*",$1),0) . 
"\n.align 2"; + } + } else { ""; } +}; + +my $align = sub { + "\tALIGN\t".2**@_[0]; +} if ($flavour =~ /win/); + $align = sub { + ".p2align\t".@_[0]; +} if ($flavour =~ /coff/); + +my $byte = sub { + "\tDCB\t".join(',',@_); +} if ($flavour =~ /win/); + +my $short = sub { + "\tDCWU\t".join(',',@_); +} if ($flavour =~ /win/); + +my $word = sub { + "\tDCDU\t".join(',',@_); +} if ($flavour =~ /win/); + +my $long = $word if ($flavour =~ /win/); + +my $quad = sub { + "\tDCQU\t".join(',',@_); +} if ($flavour =~ /win/); + +my $skip = sub { + "\tSPACE\t".shift; +} if ($flavour =~ /win/); + +my $code = sub { + "\tCODE@_[0]"; +} if ($flavour =~ /win/); + +my $thumb = sub { # .thumb should appear prior .text in source + "# define ARM THUMB\n" . + "\tTHUMB"; +} if ($flavour =~ /win/); + +my $text = sub { + "\tAREA\t|.text|,CODE,ALIGN=8,".($flavour =~ /64/ ? "ARM64" : "ARM"); +} if ($flavour =~ /win/); + +my $syntax = sub {} if ($flavour =~ /win/); # omit .syntax + +my $rva = sub { + # .rva directive comes in handy only on 32-bit Windows, i.e. it can + # be used only in '#if defined(_WIN32) && !defined(_WIN64)' sections. + # However! Corresponding compilers don't seem to bet on PIC, which + # raises the question why would assembler programmer have to jump + # through the hoops? But just in case, it would go as following: + # + # ldr r1,.LOPENSSL_armcap + # ldr r2,.LOPENSSL_armcap+4 + # adr r0,.LOPENSSL_armcap + # bic r1,r1,#1 ; de-thumb-ify link.exe's ideas + # sub r0,r0,r1 ; r0 is image base now + # ldr r0,[r0,r2] + # ... + #.LOPENSSL_armcap: + # .rva .LOPENSSL_armcap ; self-reference + # .rva OPENSSL_armcap_P ; real target + # + # Non-position-independent [and ISA-neutral] alternative is so much + # simpler: + # + # ldr r0,.LOPENSSL_armcap + # ldr r0,[r0] + # ... + #.LOPENSSL_armcap: + # .long OPENSSL_armcap_P + # + "\tDCDU\t@_[0]\n\tRELOC\t2" +} if ($flavour =~ /win(?!64)/); + +################################################################ +# some broken instructions in Visual Studio armasm[64]... + +my $it = sub {} if ($flavour =~ /win32/); # omit 'it' + +my $ext = sub { + "\text8\t".join(',',@_); +} if ($flavour =~ /win64/); + +my $csel = sub { + my ($args,$comment) = split(m|\s*//|,shift); + my @regs = split(m|,\s*|,$args); + my $cond = pop(@regs); + + "\tcsel$cond\t".join(',',@regs); +} if ($flavour =~ /win64/); + +my $csetm = sub { + my ($args,$comment) = split(m|\s*//|,shift); + my @regs = split(m|,\s*|,$args); + my $cond = pop(@regs); + + "\tcsetm$cond\t".join(',',@regs); +} if ($flavour =~ /win64/); + +# ... then conditional branch instructions are also broken, but +# maintaining all the variants is tedious, so I kludge-fix it +# elsewhere... +################################################################ +my $adrp = sub { + my ($args,$comment) = split(m|\s*//|,shift); + "\tadrp\t$args\@PAGE"; +} if ($flavour =~ /ios64/); + +my $paciasp = sub { + ($flavour =~ /linux/) ? "\t.inst\t0xd503233f" + : &$inst(0xd503233f); +}; + +my $autiasp = sub { + ($flavour =~ /linux/) ? 
"\t.inst\t0xd50323bf" + : &$inst(0xd50323bf); +}; + +sub range { + my ($r,$sfx,$start,$end) = @_; + + join(",",map("$r$_$sfx",($start..$end))); +} + +sub expand_line { + my $line = shift; + my @ret = (); + + pos($line)=0; + + while ($line =~ m/\G[^@\/\{\"]*/g) { + if ($line =~ m/\G(@|\/\/|$)/gc) { + last; + } + elsif ($line =~ m/\G\{/gc) { + my $saved_pos = pos($line); + $line =~ s/\G([rdqv])([0-9]+)([^\-]*)\-\1([0-9]+)\3/range($1,$3,$2,$4)/e; + pos($line) = $saved_pos; + $line =~ m/\G[^\}]*\}/g; + } + elsif ($line =~ m/\G\"/gc) { + $line =~ m/\G[^\"]*\"/g; + } + } + + $line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge; + + if ($flavour =~ /win/) { + # adjust alignment hints, "[rN,:32]" -> "[rN@32]" + $line =~ s/(\[\s*(?:r[0-9]+|sp))\s*,?\s*:([0-9]+\s*\])/$1\@$2/; + # adjust local labels, ".Lwhatever" -> "|$Lwhatever|" + $line =~ s/\.(L\w{2,})/|\$$1|/g; + # omit "#:lo12:" on win64 + $line =~ s/#:lo12://; + } elsif ($flavour =~ /coff(?!64)/) { + $line =~ s/\.L(\w{2,})/(\$ML$1)/g; + } elsif ($flavour =~ /ios64/) { + $line =~ s/#:lo12:(\w+)/$1\@PAGEOFF/; + } + + if ($flavour =~ /64/) { + # "vX.Md[N]" -> "vX.d[N] + $line =~ s/\b(v[0-9]+)\.[1-9]+([bhsd]\[[0-9]+\])/$1.$2/; + } + + return $line; +} + +while(my $line=<>) { + + # fix up assembler-specific commentary delimiter + $line =~ s/@(?=[\s@])/\;/g if ($flavour =~ /win|coff/); + + if ($line =~ m/^\s*(#|@|;|\/\/)/) { print $line; next; } + + $line =~ s|/\*.*\*/||; # get rid of C-style comments... + $line =~ s|^\s+||; # ... and skip white spaces in beginning... + $line =~ s|\s+$||; # ... and at the end + + { + $line =~ s|[\b\.]L(\w{2,})|L$1|g; # common denominator for Locallabel + $line =~ s|\bL(\w{2,})|\.L$1|g if ($dotinlocallabels); + } + + { + $line =~ s|(^[\.\w]+)\:\s*||; + my $label = $1; + if ($label) { + $label = ($GLOBALS{$label} or $label); + if ($flavour =~ /win/) { + $label =~ s|^\.L(?=\w)|\$L|; + printf "|%s|%s", $label, ($label eq $in_proc ? " PROC" : ""); + } else { + $label =~ s|^\.L(?=\w)|\$ML| if ($flavour =~ /coff(?!64)/); + printf "%s:", $label; + } + } + } + + if ($line !~ m/^[#@;]/) { + $line =~ s|^\s*(\.?)(\S+)\s*||; + my $c = $1; $c = "\t" if ($c eq ""); + my $mnemonic = $2; + my $opcode; + if ($mnemonic =~ m/([^\.]+)\.([^\.]+)/) { + $opcode = eval("\$$1_$2"); + } else { + $opcode = eval("\$$mnemonic"); + } + + my $arg=expand_line($line); + + if (ref($opcode) eq 'CODE') { + $line = &$opcode($arg); + } elsif ($mnemonic) { + if ($flavour =~ /win64/) { + # "b.cond" -> "bcond", kludge-fix:-( + $mnemonic =~ s/^b\.([a-z]{2}$)/b$1/; + } + $line = $c.$mnemonic; + $line.= "\t$arg" if ($arg ne ""); + } + } + + print $line if ($line); + print "\n"; +} + +print "\tEND\n" if ($flavour =~ /win/); + +close STDOUT; diff --git a/crypto/blst_src/asm/ct_inverse_mod_256-armv8.pl b/crypto/blst_src/asm/ct_inverse_mod_256-armv8.pl new file mode 100755 index 00000000000..ced8c6c37e9 --- /dev/null +++ b/crypto/blst_src/asm/ct_inverse_mod_256-armv8.pl @@ -0,0 +1,586 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast Euclidean inversion as suggested in +# https://eprint.iacr.org/2020/972. ~4.600 cycles on Apple M1, ~8.900 - +# on Cortex-A57. 
+# +# void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod, +# const vec256 modx); +# +$python_ref.=<<'___'; +def ct_inverse_mod_256(inp, mod): + a, u = inp, 1 + b, v = mod, 0 + + k = 31 + mask = (1 << k) - 1 + + for i in range(0, 512 // k - 1): + # __ab_approximation_31 + n = max(a.bit_length(), b.bit_length()) + if n < 64: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-k-2)) << k) + b_ = (b & mask) | ((b >> (n-k-2)) << k) + + # __inner_loop_31 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + + # __smul_256_n_shift_by_31 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if a < 0: + a, f0, g0 = -a, -f0, -g0 + if b < 0: + b, f1, g1 = -b, -f1, -g1 + + # __smul_512x63 + u, v = u*f0 + v*g0, u*f1 + v*g1 + + if 512 % k + k: + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, 512 % k + k): + if a & 1: + if a < b: + a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 + a, f0, g0 = a-b, f0-f1, g0-g1 + a, f1, g1 = a >> 1, f1 << 1, g1 << 1 + + v = u*f1 + v*g1 + + mod <<= 512 - mod.bit_length() # align to the left + if v < 0: + v += mod + if v < 0: + v += mod + elif v == 1<<512 + v -= mod + + return v & (2**512 - 1) # to be reduced % mod +___ + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3)); +my @acc=map("x$_",(4..11)); +my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(12..17)); +my $cnt = $n_ptr; +my @t = map("x$_",(19..26)); +my ($a_lo, $b_lo) = @acc[3,7]; + +$frame = 16+2*512; + +$code.=<<___; +.text + +.globl ct_inverse_mod_256 +.type ct_inverse_mod_256, %function +.align 5 +ct_inverse_mod_256: + paciasp + stp x29, x30, [sp,#-80]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + sub sp, sp, #$frame + + ldp @acc[0], @acc[1], [$in_ptr,#8*0] + ldp @acc[2], @acc[3], [$in_ptr,#8*2] + + add $in_ptr, sp, #16+511 // find closest 512-byte-aligned spot + and $in_ptr, $in_ptr, #-512 // in the frame... 
+ str $out_ptr, [sp] + + ldp @acc[4], @acc[5], [$n_ptr,#8*0] + ldp @acc[6], @acc[7], [$n_ptr,#8*2] + + stp @acc[0], @acc[1], [$in_ptr,#8*0] // copy input to |a| + stp @acc[2], @acc[3], [$in_ptr,#8*2] + stp @acc[4], @acc[5], [$in_ptr,#8*4] // copy modulus to |b| + stp @acc[6], @acc[7], [$in_ptr,#8*6] + + ////////////////////////////////////////// first iteration + bl .Lab_approximation_31_256_loaded + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + str $f0,[$out_ptr,#8*8] // initialize |u| with |f0| + + mov $f0, $f1 // |f1| + mov $g0, $g1 // |g1| + add $out_ptr, $out_ptr, #8*4 // pointer to dst |b| + bl __smul_256_n_shift_by_31 + str $f0, [$out_ptr,#8*9] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov $f_, $f0 // corrected |f0| + mov $g_, $g0 // corrected |g0| + + mov $f0, $f1 // |f1| + mov $g0, $g1 // |g1| + add $out_ptr, $out_ptr, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + ldr @acc[4], [$in_ptr,#8*8] // |u| + ldr @acc[5], [$in_ptr,#8*13] // |v| + madd @acc[0], $f_, @acc[4], xzr // |u|*|f0| + madd @acc[0], $g_, @acc[5], @acc[0] // |v|*|g0| + str @acc[0], [$out_ptr,#8*4] + asr @acc[1], @acc[0], #63 // sign extenstion + stp @acc[1], @acc[1], [$out_ptr,#8*5] + stp @acc[1], @acc[1], [$out_ptr,#8*7] + + madd @acc[0], $f0, @acc[4], xzr // |u|*|f1| + madd @acc[0], $g0, @acc[5], @acc[0] // |v|*|g1| + str @acc[0], [$out_ptr,#8*9] + asr @acc[1], @acc[0], #63 // sign extenstion + stp @acc[1], @acc[1], [$out_ptr,#8*10] + stp @acc[1], @acc[1], [$out_ptr,#8*12] +___ +for($i=2; $i<15; $i++) { +$code.=<<___; + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov $f_, $f0 // corrected |f0| + mov $g_, $g0 // corrected |g0| + + mov $f0, $f1 // |f1| + mov $g0, $g1 // |g1| + add $out_ptr, $out_ptr, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add $out_ptr, $out_ptr, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc @t[3], @t[3], @t[4] + str @t[3], [$out_ptr,#8*4] + + mov $f_, $f0 // corrected |f1| + mov $g_, $g0 // corrected |g1| + add $out_ptr, $out_ptr, #8*5 // pointer to destination |v| + bl __smul_256x63 +___ +$code.=<<___ if ($i>7); + bl __smul_512x63_tail +___ +$code.=<<___ if ($i<=7); + adc @t[3], @t[3], @t[4] + stp @t[3], @t[3], [$out_ptr,#8*4] + stp @t[3], @t[3], [$out_ptr,#8*6] +___ +} +$code.=<<___; + ////////////////////////////////////////// two[!] 
last iterations + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| + mov $cnt, #47 // 31 + 512 % 31 + //bl __ab_approximation_62_256 // |a| and |b| are exact, + ldr $a_lo, [$in_ptr,#8*0] // just load + ldr $b_lo, [$in_ptr,#8*4] + bl __inner_loop_62_256 + + mov $f_, $f1 + mov $g_, $g1 + ldr $out_ptr, [sp] // original out_ptr + bl __smul_256x63 + bl __smul_512x63_tail + ldr x30, [x29,#8] + + smulh @t[1], @acc[3], $g_ // figure out top-most limb + ldp @acc[4], @acc[5], [$nx_ptr,#8*0] + adc @t[4], @t[4], @t[6] + ldp @acc[6], @acc[7], [$nx_ptr,#8*2] + + add @t[1], @t[1], @t[4] // @t[1] is 1, 0 or -1 + asr @t[0], @t[1], #63 // sign as mask + + and @t[4], @acc[4], @t[0] // add mod<<256 conditionally + and @t[5], @acc[5], @t[0] + adds @acc[0], @acc[0], @t[4] + and @t[6], @acc[6], @t[0] + adcs @acc[1], @acc[1], @t[5] + and @t[7], @acc[7], @t[0] + adcs @acc[2], @acc[2], @t[6] + adcs @acc[3], @t[3], @t[7] + adc @t[1], @t[1], xzr // @t[1] is 1, 0 or -1 + + neg @t[0], @t[1] + orr @t[1], @t[1], @t[0] // excess bit or sign as mask + asr @t[0], @t[0], #63 // excess bit as mask + + and @acc[4], @acc[4], @t[1] // mask |mod| + and @acc[5], @acc[5], @t[1] + and @acc[6], @acc[6], @t[1] + and @acc[7], @acc[7], @t[1] + + eor @acc[4], @acc[4], @t[0] // conditionally negate |mod| + eor @acc[5], @acc[5], @t[0] + adds @acc[4], @acc[4], @t[0], lsr#63 + eor @acc[6], @acc[6], @t[0] + adcs @acc[5], @acc[5], xzr + eor @acc[7], @acc[7], @t[0] + adcs @acc[6], @acc[6], xzr + adc @acc[7], @acc[7], xzr + + adds @acc[0], @acc[0], @acc[4] // final adjustment for |mod|<<256 + adcs @acc[1], @acc[1], @acc[5] + adcs @acc[2], @acc[2], @acc[6] + stp @acc[0], @acc[1], [$out_ptr,#8*4] + adc @acc[3], @acc[3], @acc[7] + stp @acc[2], @acc[3], [$out_ptr,#8*6] + + add sp, sp, #$frame + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldr x29, [sp],#80 + autiasp + ret +.size ct_inverse_mod_256,.-ct_inverse_mod_256 + +//////////////////////////////////////////////////////////////////////// +.type __smul_256x63, %function +.align 5 +__smul_256x63: +___ +for($j=0; $j<2; $j++) { +my $f_ = $f_; $f_ = $g_ if ($j); +my @acc = @acc; @acc = @acc[4..7] if ($j); +my $k = 8*8+8*5*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |u| (or |v|) + asr $f1, $f_, #63 // |f_|'s sign as mask (or |g_|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor $f_, $f_, $f1 // conditionally negate |f_| (or |g_|) + ldr @t[3+$j], [$in_ptr,#8*4+$k] + + eor @acc[0], @acc[0], $f1 // conditionally negate |u| (or |v|) + sub $f_, $f_, $f1 + eor @acc[1], @acc[1], $f1 + adds @acc[0], @acc[0], $f1, lsr#63 + eor @acc[2], @acc[2], $f1 + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], $f1 + adcs @acc[2], @acc[2], xzr + eor @t[3+$j], @t[3+$j], $f1 + umulh @t[0], @acc[0], $f_ + adcs @acc[3], @acc[3], xzr + umulh @t[1], @acc[1], $f_ + adcs @t[3+$j], @t[3+$j], xzr + umulh @t[2], @acc[2], $f_ +___ +$code.=<<___ if ($j!=0); + adc $g1, xzr, xzr // used in __smul_512x63_tail +___ +$code.=<<___; + mul @acc[0], @acc[0], $f_ + cmp $f_, #0 + mul @acc[1], @acc[1], $f_ + csel @t[3+$j], @t[3+$j], xzr, ne + mul @acc[2], @acc[2], $f_ + adds @acc[1], @acc[1], @t[0] + mul @t[5+$j], @acc[3], $f_ + adcs @acc[2], @acc[2], @t[1] + adcs @t[5+$j], @t[5+$j], @t[2] +___ +$code.=<<___ if ($j==0); + adc @t[7], xzr, xzr +___ +} +$code.=<<___; + adc @t[7], @t[7], xzr + + adds @acc[0], @acc[0], @acc[4] + adcs @acc[1], @acc[1], @acc[5] + adcs @acc[2], @acc[2], @acc[6] + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adcs @t[5], @t[5], @t[6] + 
stp @acc[2], @t[5], [$out_ptr,#8*2] + + ret +.size __smul_256x63,.-__smul_256x63 + +.type __smul_512x63_tail, %function +.align 5 +__smul_512x63_tail: + umulh @t[5], @acc[3], $f_ + ldp @acc[1], @acc[2], [$in_ptr,#8*18] // load rest of |v| + adc @t[7], @t[7], xzr + ldr @acc[3], [$in_ptr,#8*20] + and @t[3], @t[3], $f_ + + umulh @acc[7], @acc[7], $g_ // resume |v|*|g1| chain + + sub @t[5], @t[5], @t[3] // tie up |u|*|f1| chain + asr @t[6], @t[5], #63 + + eor @acc[1], @acc[1], $f1 // conditionally negate rest of |v| + eor @acc[2], @acc[2], $f1 + adds @acc[1], @acc[1], $g1 + eor @acc[3], @acc[3], $f1 + adcs @acc[2], @acc[2], xzr + umulh @t[0], @t[4], $g_ + adc @acc[3], @acc[3], xzr + umulh @t[1], @acc[1], $g_ + add @acc[7], @acc[7], @t[7] + umulh @t[2], @acc[2], $g_ + + mul @acc[0], @t[4], $g_ + mul @acc[1], @acc[1], $g_ + adds @acc[0], @acc[0], @acc[7] + mul @acc[2], @acc[2], $g_ + adcs @acc[1], @acc[1], @t[0] + mul @t[3], @acc[3], $g_ + adcs @acc[2], @acc[2], @t[1] + adcs @t[3], @t[3], @t[2] + adc @t[4], xzr, xzr // used in the final step + + adds @acc[0], @acc[0], @t[5] + adcs @acc[1], @acc[1], @t[6] + adcs @acc[2], @acc[2], @t[6] + stp @acc[0], @acc[1], [$out_ptr,#8*4] + adcs @t[3], @t[3], @t[6] // carry is used in the final step + stp @acc[2], @t[3], [$out_ptr,#8*6] + + ret +.size __smul_512x63_tail,.-__smul_512x63_tail + +.type __smul_256_n_shift_by_31, %function +.align 5 +__smul_256_n_shift_by_31: +___ +for($j=0; $j<2; $j++) { +my $f0 = $f0; $f0 = $g0 if ($j); +my @acc = @acc; @acc = @acc[4..7] if ($j); +my $k = 8*4*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |a| (or |b|) + asr @t[5], $f0, #63 // |f0|'s sign as mask (or |g0|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor @t[6], $f0, @t[5] // conditionally negate |f0| (or |g0|) + + eor @acc[0], @acc[0], @t[5] // conditionally negate |a| (or |b|) + sub @t[6], @t[6], @t[5] + eor @acc[1], @acc[1], @t[5] + adds @acc[0], @acc[0], @t[5], lsr#63 + eor @acc[2], @acc[2], @t[5] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[5] + umulh @t[0], @acc[0], @t[6] + adcs @acc[2], @acc[2], xzr + umulh @t[1], @acc[1], @t[6] + adc @acc[3], @acc[3], xzr + umulh @t[2], @acc[2], @t[6] + and @t[5], @t[5], @t[6] + umulh @t[3+$j], @acc[3], @t[6] + neg @t[5], @t[5] + + mul @acc[0], @acc[0], @t[6] + mul @acc[1], @acc[1], @t[6] + mul @acc[2], @acc[2], @t[6] + adds @acc[1], @acc[1], @t[0] + mul @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], @t[1] + adcs @acc[3], @acc[3], @t[2] + adc @t[3+$j], @t[3+$j], @t[5] +___ +} +$code.=<<___; + adds @acc[0], @acc[0], @acc[4] + adcs @acc[1], @acc[1], @acc[5] + adcs @acc[2], @acc[2], @acc[6] + adcs @acc[3], @acc[3], @acc[7] + adc @acc[4], @t[3], @t[4] + + extr @acc[0], @acc[1], @acc[0], #31 + extr @acc[1], @acc[2], @acc[1], #31 + extr @acc[2], @acc[3], @acc[2], #31 + asr @t[4], @acc[4], #63 // result's sign as mask + extr @acc[3], @acc[4], @acc[3], #31 + + eor @acc[0], @acc[0], @t[4] // ensure the result is positive + eor @acc[1], @acc[1], @t[4] + adds @acc[0], @acc[0], @t[4], lsr#63 + eor @acc[2], @acc[2], @t[4] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[4] + adcs @acc[2], @acc[2], xzr + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adc @acc[3], @acc[3], xzr + stp @acc[2], @acc[3], [$out_ptr,#8*2] + + eor $f0, $f0, @t[4] // adjust |f/g| accordingly + eor $g0, $g0, @t[4] + sub $f0, $f0, @t[4] + sub $g0, $g0, @t[4] + + ret +.size __smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31 +___ + +{ +my @a = @acc[0..3]; +my @b = @acc[4..7]; +my ($fg0, $fg1, $bias) = ($g0, $g1, @t[4]); + 
+$code.=<<___; +.type __ab_approximation_31_256, %function +.align 4 +__ab_approximation_31_256: + ldp @a[2], @a[3], [$in_ptr,#8*2] + ldp @b[2], @b[3], [$in_ptr,#8*6] + ldp @a[0], @a[1], [$in_ptr,#8*0] + ldp @b[0], @b[1], [$in_ptr,#8*4] + +.Lab_approximation_31_256_loaded: + orr @t[0], @a[3], @b[3] // check top-most limbs, ... + cmp @t[0], #0 + csel @a[3], @a[3], @a[2], ne + csel @b[3], @b[3], @b[2], ne + csel @a[2], @a[2], @a[1], ne + orr @t[0], @a[3], @b[3] // and ones before top-most, ... + csel @b[2], @b[2], @b[1], ne + + cmp @t[0], #0 + csel @a[3], @a[3], @a[2], ne + csel @b[3], @b[3], @b[2], ne + csel @a[2], @a[2], @a[0], ne + orr @t[0], @a[3], @b[3] // and one more, ... + csel @b[2], @b[2], @b[0], ne + + clz @t[0], @t[0] + cmp @t[0], #64 + csel @t[0], @t[0], xzr, ne + csel @a[3], @a[3], @a[2], ne + csel @b[3], @b[3], @b[2], ne + neg @t[1], @t[0] + + lslv @a[3], @a[3], @t[0] // align high limbs to the left + lslv @b[3], @b[3], @t[0] + lsrv @a[2], @a[2], @t[1] + lsrv @b[2], @b[2], @t[1] + and @a[2], @a[2], @t[1], asr#6 + and @b[2], @b[2], @t[1], asr#6 + orr $a_lo, @a[3], @a[2] + orr $b_lo, @b[3], @b[2] + + bfxil $a_lo, @a[0], #0, #31 + bfxil $b_lo, @b[0], #0, #31 + + b __inner_loop_31_256 + ret +.size __ab_approximation_31_256,.-__ab_approximation_31_256 + +.type __inner_loop_31_256, %function +.align 4 +__inner_loop_31_256: + mov $cnt, #31 + mov $fg0, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov $fg1, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov $bias,#0x7FFFFFFF7FFFFFFF + +.Loop_31_256: + sbfx @t[3], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting + sub $cnt, $cnt, #1 + and @t[0], $b_lo, @t[3] + sub @t[1], $b_lo, $a_lo // |b_|-|a_| + subs @t[2], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov @t[0], $fg1 + csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_| + csel $a_lo, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel $fg1, $fg1, $fg0, hs // exchange |fg0| and |fg1| + csel $fg0, $fg0, @t[0], hs + lsr $a_lo, $a_lo, #1 + and @t[0], $fg1, @t[3] + and @t[1], $bias, @t[3] + sub $fg0, $fg0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add $fg1, $fg1, $fg1 // |f1|<<=1 + add $fg0, $fg0, @t[1] + sub $fg1, $fg1, $bias + cbnz $cnt, .Loop_31_256 + + mov $bias, #0x7FFFFFFF + ubfx $f0, $fg0, #0, #32 + ubfx $g0, $fg0, #32, #32 + ubfx $f1, $fg1, #0, #32 + ubfx $g1, $fg1, #32, #32 + sub $f0, $f0, $bias // remove bias + sub $g0, $g0, $bias + sub $f1, $f1, $bias + sub $g1, $g1, $bias + + ret +.size __inner_loop_31_256,.-__inner_loop_31_256 + +.type __inner_loop_62_256, %function +.align 4 +__inner_loop_62_256: + mov $f0, #1 // |f0|=1 + mov $g0, #0 // |g0|=0 + mov $f1, #0 // |f1|=0 + mov $g1, #1 // |g1|=1 + +.Loop_62_256: + sbfx @t[3], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting + sub $cnt, $cnt, #1 + and @t[0], $b_lo, @t[3] + sub @t[1], $b_lo, $a_lo // |b_|-|a_| + subs @t[2], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov @t[0], $f0 + csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_| + csel $a_lo, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + mov @t[1], $g0 + csel $f0, $f0, $f1, hs // exchange |f0| and |f1| + csel $f1, $f1, @t[0], hs + csel $g0, $g0, $g1, hs // exchange |g0| and |g1| + csel $g1, $g1, @t[1], hs + lsr $a_lo, $a_lo, #1 + and @t[0], $f1, @t[3] + and @t[1], $g1, @t[3] + add $f1, $f1, $f1 // |f1|<<=1 + add $g1, $g1, $g1 // |g1|<<=1 + sub $f0, $f0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $g0, $g0, @t[1] // |g0|-=|g1| (or |g0-=0| ...) 
+ cbnz $cnt, .Loop_62_256 + + ret +.size __inner_loop_62_256,.-__inner_loop_62_256 +___ +} + +foreach(split("\n",$code)) { + s/\b(smaddl\s+x[0-9]+,\s)x([0-9]+,\s+)x([0-9]+)/$1w$2w$3/; + print $_,"\n"; +} +close STDOUT; diff --git a/crypto/blst_src/asm/ct_inverse_mod_256-x86_64.pl b/crypto/blst_src/asm/ct_inverse_mod_256-x86_64.pl new file mode 100755 index 00000000000..24ab5452930 --- /dev/null +++ b/crypto/blst_src/asm/ct_inverse_mod_256-x86_64.pl @@ -0,0 +1,837 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast Euclidean inversion as suggested in +# https://eprint.iacr.org/2020/972. ~5.300 cycles on Coffee Lake. +# +# void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod, +# const vec256 modx); +# +$python_ref.=<<'___'; +def ct_inverse_mod_256(inp, mod): + a, u = inp, 1 + b, v = mod, 0 + + k = 31 + mask = (1 << k) - 1 + + for i in range(0, 512 // k - 1): + # __ab_approximation_31 + n = max(a.bit_length(), b.bit_length()) + if n < 64: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-k-2)) << k) + b_ = (b & mask) | ((b >> (n-k-2)) << k) + + # __inner_loop_31 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + + # __smulq_256_n_shift_by_31 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if a < 0: + a, f0, g0 = -a, -f0, -g0 + if b < 0: + b, f1, g1 = -b, -f1, -g1 + + # __smulq_512x63 + u, v = u*f0 + v*g0, u*f1 + v*g1 + + if 512 % k + k: + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, 512 % k + k): + if a & 1: + if a < b: + a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 + a, f0, g0 = a-b, f0-f1, g0-g1 + a, f1, g1 = a >> 1, f1 << 1, g1 << 1 + + v = u*f1 + v*g1 + + mod <<= 512 - mod.bit_length() # align to the left + if v < 0: + v += mod + if v < 0: + v += mod + elif v == 1<<512 + v -= mod + + return v & (2**512 - 1) # to be reduced % mod +___ + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx"); +my @acc = map("%r$_",(8..15)); +my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13"); +my $cnt = "%edx"; + +$frame = 8*6+2*512; + +$code.=<<___; +.text + +.globl ct_inverse_mod_256 +.type ct_inverse_mod_256,\@function,4,"unwind" +.align 32 +ct_inverse_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 8*6+511(%rsp), %rax # find closest 512-byte-aligned spot + and \$-512, %rax # in the frame... 
+ mov $out_ptr, 8*4(%rsp) + mov $nx_ptr, 8*5(%rsp) + + mov 8*0($in_ptr), @acc[0] # load input + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + + mov 8*0($n_ptr), @acc[4] # load modulus + mov 8*1($n_ptr), @acc[5] + mov 8*2($n_ptr), @acc[6] + mov 8*3($n_ptr), @acc[7] + + mov @acc[0], 8*0(%rax) # copy input to |a| + mov @acc[1], 8*1(%rax) + mov @acc[2], 8*2(%rax) + mov @acc[3], 8*3(%rax) + + mov @acc[4], 8*4(%rax) # copy modulus to |b| + mov @acc[5], 8*5(%rax) + mov @acc[6], 8*6(%rax) + mov @acc[7], 8*7(%rax) + mov %rax, $in_ptr + + ################################# first iteration + mov \$31, $cnt + call __ab_approximation_31_256 + #mov $f0, 8*0(%rsp) + #mov $g0, 8*1(%rsp) + mov $f1, 8*2(%rsp) + mov $g1, 8*3(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_256_n_shift_by_31 + #mov $f0, 8*0(%rsp) # corrected |f0| + #mov $g0, 8*1(%rsp) # corrected |g0| + mov $f0, 8*8($out_ptr) # initialize |u| with |f0| + + mov 8*2(%rsp), $f0 # |f1| + mov 8*3(%rsp), $g0 # |g1| + lea 8*4($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_256_n_shift_by_31 + #mov $f0, 8*2(%rsp) # corrected |f1| + #mov $g0, 8*3(%rsp) # corrected |g1| + mov $f0, 8*9($out_ptr) # initialize |v| with |f1| + + ################################# second iteration + xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$31, $cnt + call __ab_approximation_31_256 + #mov $f0, 8*0(%rsp) + #mov $g0, 8*1(%rsp) + mov $f1, 8*2(%rsp) + mov $g1, 8*3(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_256_n_shift_by_31 + mov $f0, 8*0(%rsp) # corrected |f0| + mov $g0, 8*1(%rsp) # corrected |g0| + + mov 8*2(%rsp), $f0 # |f1| + mov 8*3(%rsp), $g0 # |g1| + lea 8*4($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_256_n_shift_by_31 + #mov $f0, 8*2(%rsp) # corrected |f1| + #mov $g0, 8*3(%rsp) # corrected |g1| + + mov 8*8($in_ptr), @acc[0] # |u| + mov 8*13($in_ptr), @acc[4] # |v| + mov @acc[0], @acc[1] + imulq 8*0(%rsp), @acc[0] # |u|*|f0| + mov @acc[4], @acc[5] + imulq 8*1(%rsp), @acc[4] # |v|*|g0| + add @acc[4], @acc[0] + mov @acc[0], 8*4($out_ptr) # destination |u| + sar \$63, @acc[0] # sign extension + mov @acc[0], 8*5($out_ptr) + mov @acc[0], 8*6($out_ptr) + mov @acc[0], 8*7($out_ptr) + mov @acc[0], 8*8($out_ptr) + lea 8*8($in_ptr), $in_ptr # make in_ptr "rewindable" with xor + + imulq $f0, @acc[1] # |u|*|f1| + imulq $g0, @acc[5] # |v|*|g1| + add @acc[5], @acc[1] + mov @acc[1], 8*9($out_ptr) # destination |v| + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*10($out_ptr) + mov @acc[1], 8*11($out_ptr) + mov @acc[1], 8*12($out_ptr) + mov @acc[1], 8*13($out_ptr) +___ +for($i=2; $i<15; $i++) { +my $smul_512x63 = $i>8 ? 
"__smulq_512x63" + : "__smulq_256x63"; +$code.=<<___; + xor \$256+8*8, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$31, $cnt + call __ab_approximation_31_256 + #mov $f0, 8*0(%rsp) + #mov $g0, 8*1(%rsp) + mov $f1, 8*2(%rsp) + mov $g1, 8*3(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_256_n_shift_by_31 + mov $f0, 8*0(%rsp) # corrected |f0| + mov $g0, 8*1(%rsp) # corrected |g0| + + mov 8*2(%rsp), $f0 # |f1| + mov 8*3(%rsp), $g0 # |g1| + lea 8*4($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_256_n_shift_by_31 + mov $f0, 8*2(%rsp) # corrected |f1| + mov $g0, 8*3(%rsp) # corrected |g1| + + mov 8*0(%rsp), $f0 # |f0| + mov 8*1(%rsp), $g0 # |g0| + lea 8*8($in_ptr), $in_ptr # pointer to source |u|v| + lea 8*4($out_ptr), $out_ptr # pointer to destination |u| + call __smulq_256x63 + + mov 8*2(%rsp), $f0 # |f1| + mov 8*3(%rsp), $g0 # |g1| + lea 8*5($out_ptr),$out_ptr # pointer to destination |v| + call $smul_512x63 +___ +$code.=<<___ if ($i==8); + sar \$63, %rbp # sign extension + mov %rbp, 8*5($out_ptr) + mov %rbp, 8*6($out_ptr) + mov %rbp, 8*7($out_ptr) +___ +} +$code.=<<___; + ################################# two[!] last iterations in one go + xor \$256+8*8, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$47, $cnt # 31 + 512 % 31 + #call __ab_approximation_31 # |a| and |b| are exact, just load + mov 8*0($in_ptr), @acc[0] # |a_lo| + #xor @acc[1], @acc[1] # |a_hi| + mov 8*4($in_ptr), @acc[2] # |b_lo| + #xor @acc[3], @acc[3] # |b_hi| + call __inner_loop_62_256 + #mov $f0, 8*0(%rsp) + #mov $g0, 8*1(%rsp) + #mov $f1, 8*2(%rsp) + #mov $g1, 8*3(%rsp) + + #mov 8*0(%rsp), $f0 # |f0| + #mov 8*1(%rsp), $g0 # |g0| + lea 8*8($in_ptr), $in_ptr # pointer to source |u|v| + #lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + #call __smulq_256x63 + + #mov 8*2(%rsp), $f0 # |f1| + #mov 8*3(%rsp), $g0 # |g1| + mov $f1, $f0 + mov $g1, $g0 + mov 8*4(%rsp), $out_ptr # original |out_ptr| + call __smulq_512x63 + adc %rbp, %rdx # the excess limb of the result + + mov 8*5(%rsp), $in_ptr # original |nx_ptr| + mov %rdx, %rax + sar \$63, %rdx # result's sign as mask + + mov %rdx, @acc[0] # mask |modulus| + mov %rdx, @acc[1] + and 8*0($in_ptr), @acc[0] + mov %rdx, @acc[2] + and 8*1($in_ptr), @acc[1] + and 8*2($in_ptr), @acc[2] + and 8*3($in_ptr), %rdx + + add @acc[0], @acc[4] # conditionally add |modulus|<<256 + adc @acc[1], @acc[5] + adc @acc[2], @acc[6] + adc %rdx, @acc[7] + adc \$0, %rax + + mov %rax, %rdx + neg %rax + or %rax, %rdx # excess bit or sign as mask + sar \$63, %rax # excess bit as mask + + mov %rdx, @acc[0] # mask |modulus| + mov %rdx, @acc[1] + and 8*0($in_ptr), @acc[0] + mov %rdx, @acc[2] + and 8*1($in_ptr), @acc[1] + and 8*2($in_ptr), @acc[2] + and 8*3($in_ptr), %rdx + + xor %rax, @acc[0] # conditionally negate |modulus| + xor %rcx, %rcx + xor %rax, @acc[1] + sub %rax, %rcx + xor %rax, @acc[2] + xor %rax, %rdx + add %rcx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, %rdx + + add @acc[0], @acc[4] # final adjustment for |modulus|<<256 + adc @acc[1], @acc[5] + adc @acc[2], @acc[6] + adc %rdx, @acc[7] + + mov @acc[4], 8*4($out_ptr) # store absolute value + mov @acc[5], 8*5($out_ptr) + mov @acc[6], 8*6($out_ptr) + mov @acc[7], 8*7($out_ptr) + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp 
+.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size ct_inverse_mod_256,.-ct_inverse_mod_256 +___ +######################################################################## +# Signed |u|*|f?|+|v|*|g?| subroutines. "NNN" in "NNNx63" suffix refers +# to the maximum bit-length of the *result*, and "63" - to the maximum +# bit-length of the |f?| and |g?| single-limb multiplicands. However! +# The latter should not be taken literally, as they are always chosen so +# that "bad things" don't happen. For example, there comes a point when +# |v| grows beyond 383 bits, while |u| remains 383 bits wide. Yet, we +# always call __smul_383x63 to perform |u|*|f0|+|v|*|g0| step. This is +# because past that point |f0| is always 1 and |g0| is always 0. And, +# since |u| never grows beyond 383 bits, __smul_767x63 doesn't have to +# perform full-width |u|*|f1| multiplication, half-width one with sign +# extension is sufficient... +$code.=<<___; +.type __smulq_512x63,\@abi-omnipotent +.align 32 +__smulq_512x63: + mov 8*0($in_ptr), @acc[0] # load |u| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), %rbp # sign limb + + mov $f0, %rbx + sar \$63, $f0 # |f0|'s sign as mask + xor %rax, %rax + sub $f0, %rax # |f0|'s sign as bit + + xor $f0, %rbx # conditionally negate |f0| + add %rax, %rbx + + xor $f0, @acc[0] # conditionally negate |u| + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, %rbp + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, %rbp + + mulq %rbx # |u|*|f0| + mov %rax, 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<3; $i++) { +$code.=<<___; + mulq %rbx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov @acc[$i], 8*$i($out_ptr) + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + and %rbx, %rbp + neg %rbp + mulq %rbx + add %rax, @acc[3] + adc %rdx, %rbp + mov @acc[3], 8*3($out_ptr) + + mov 8*5($in_ptr), @acc[0] # load |v| + mov 8*6($in_ptr), @acc[1] + mov 8*7($in_ptr), @acc[2] + mov 8*8($in_ptr), @acc[3] + mov 8*9($in_ptr), @acc[4] + mov 8*10($in_ptr), @acc[5] + mov 8*11($in_ptr), @acc[6] + mov 8*12($in_ptr), @acc[7] + + mov $g0, $f0 + sar \$63, $f0 # |g0|'s sign as mask + xor %rax, %rax + sub $f0, %rax # |g0|'s sign as bit + + xor $f0, $g0 # conditionally negate |g0| + add %rax, $g0 + + xor $f0, @acc[0] # conditionally negate |v| + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, @acc[4] + xor $f0, @acc[5] + xor $f0, @acc[6] + xor $f0, @acc[7] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + adc \$0, @acc[6] + adc \$0, @acc[7] + + mulq $g0 + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<7; $i++) { +$code.=<<___; + mulq $g0 + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + imulq $g0 + add %rax, @acc[7] + adc \$0, %rdx # used in the final step + + mov %rbp, %rbx + sar \$63, %rbp # sign extension + + add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc %rbx, @acc[4] + adc %rbp, @acc[5] + adc %rbp, @acc[6] + adc %rbp, @acc[7] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) + mov @acc[6], 
8*6($out_ptr) + mov @acc[7], 8*7($out_ptr) + + ret +.size __smulq_512x63,.-__smulq_512x63 + +.type __smulq_256x63,\@abi-omnipotent +.align 32 +__smulq_256x63: +___ +for($j=0; $j<2; $j++) { +my $k = 8*5*$j; +my @acc=@acc; @acc=@acc[4..7] if($j); +my $top="%rbp"; $top=$g0 if($j); +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |u| (or |v|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + mov $k+8*3($in_ptr), @acc[3] + mov $k+8*4($in_ptr), $top # sign/excess limb + + mov $f0, %rbx + sar \$63, $f0 # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub $f0, %rax # |f0|'s sign as bit (or |g0|'s) + + xor $f0, %rbx # conditionally negate |f0| + add %rax, %rbx + + xor $f0, @acc[0] # conditionally negate |u| (or |v|) + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, $top + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, $top + + mulq %rbx + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<3; $i++) { +$code.=<<___; + mulq %rbx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + and %rbx, $top + neg $top + mulq %rbx + add %rax, @acc[3] + adc %rdx, $top +___ +$code.=<<___ if ($j==0); + mov $g0, $f0 +___ +} +$code.=<<___; + add @acc[4], @acc[0] # accumulate |u|*|f0| + adc @acc[5], @acc[1] + adc @acc[6], @acc[2] + adc @acc[7], @acc[3] + adc %rcx, %rbp + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov %rbp, 8*4($out_ptr) + + ret +.size __smulq_256x63,.-__smulq_256x63 +___ +######################################################################## +# Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of +# the names refers to maximum bit-lengths of |a| and |b|. As already +# mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always +# chosen so that "bad things" don't happen. For example, so that the +# sum of the products doesn't overflow, and that the final result is +# never wider than inputs... 
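+#
+# A hedged Python model (illustrative only, mirroring the python_ref at the
+# top of this file; the function name is hypothetical) of the
+# (|a|*|f0|+|b|*|g0|)>>31 step performed by __smulq_256_n_shift_by_31,
+# including the conditional negation that keeps |a| and |b| non-negative:
+#
+#   def smul_n_shift_31(a, b, f, g):
+#       r = (a*f + b*g) >> 31              # arithmetic shift of signed sum
+#       if r < 0:                          # negate result and flip f, g,
+#           r, f, g = -r, -f, -g           # just as in the reference above
+#       return r, f, g
+#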
+{ +$code.=<<___; +.type __smulq_256_n_shift_by_31,\@abi-omnipotent +.align 32 +__smulq_256_n_shift_by_31: + mov $f0, 8*0($out_ptr) # offload |f0| + mov $g0, 8*1($out_ptr) # offload |g0| + mov $f0, %rbp +___ +for($j=0; $j<2; $j++) { +my $k = 8*4*$j; +my @acc=@acc; @acc=@acc[4..7] if ($j); +my $f0="%rbp"; $f0=$g0 if ($j); +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + mov $k+8*3($in_ptr), @acc[3] + + mov $f0, %rbx + sar \$63, $f0 # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub $f0, %rax # |f0|'s sign as bit (or |g0|'s) + + xor $f0, %rbx # conditionally negate |f0| (or |g0|) + add %rax, %rbx + + xor $f0, @acc[0] # conditionally negate |a| (or |b|) + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + + mulq %rbx + mov %rax, @acc[0] + mov @acc[1], %rax + and %rbx, $f0 + neg $f0 + mov %rdx, @acc[1] +___ +for($i=1; $i<3; $i++) { +$code.=<<___; + mulq %rbx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + mulq %rbx + add %rax, @acc[3] + adc %rdx, $f0 +___ +} +$code.=<<___; + add @acc[4], @acc[0] + adc @acc[5], @acc[1] + adc @acc[6], @acc[2] + adc @acc[7], @acc[3] + adc $g0, %rbp + + mov 8*0($out_ptr), $f0 # restore original |f0| + mov 8*1($out_ptr), $g0 # restore original |g0| + + shrd \$31, @acc[1], @acc[0] + shrd \$31, @acc[2], @acc[1] + shrd \$31, @acc[3], @acc[2] + shrd \$31, %rbp, @acc[3] + + sar \$63, %rbp # sign as mask + xor %rax, %rax + sub %rbp, %rax # sign as bit + + xor %rbp, @acc[0] # conditionally negate the result + xor %rbp, @acc[1] + xor %rbp, @acc[2] + xor %rbp, @acc[3] + add %rax, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + + xor %rbp, $f0 # conditionally negate |f0| + xor %rbp, $g0 # conditionally negate |g0| + add %rax, $f0 + add %rax, $g0 + + ret +.size __smulq_256_n_shift_by_31,.-__smulq_256_n_shift_by_31 +___ +} + +{ +my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11)); +my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15"); +my ($fg0, $fg1, $bias) = ($g0, $g1, $t4); +my ($a_, $b_) = ($a_lo, $b_lo); +{ +my @a = ($a_lo, $t1, $a_hi); +my @b = ($b_lo, $t2, $b_hi); + +$code.=<<___; +.type __ab_approximation_31_256,\@abi-omnipotent +.align 32 +__ab_approximation_31_256: + mov 8*3($in_ptr), @a[2] # load |a| in reverse order + mov 8*7($in_ptr), @b[2] # load |b| in reverse order + mov 8*2($in_ptr), @a[1] + mov 8*6($in_ptr), @b[1] + mov 8*1($in_ptr), @a[0] + mov 8*5($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # check top-most limbs, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + mov 8*0($in_ptr), @a[0] + cmovz @b[0], @b[1] + mov 8*4($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... and ones before that ... 
+ cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + + mov @a[2], $t0 + or @b[2], $t0 + bsr $t0, %rcx + lea 1(%rcx), %rcx + cmovz @a[0], @a[2] + cmovz @b[0], @b[2] + cmovz $t0, %rcx + neg %rcx + #and \$63, %rcx # debugging artefact + + shldq %cl, @a[1], @a[2] # align second limb to the left + shldq %cl, @b[1], @b[2] + + mov \$0x7FFFFFFF, %eax + and %rax, @a[0] + and %rax, @b[0] + not %rax + and %rax, @a[2] + and %rax, @b[2] + or @a[2], @a[0] + or @b[2], @b[0] + + jmp __inner_loop_31_256 + + ret +.size __ab_approximation_31_256,.-__ab_approximation_31_256 +___ +} +$code.=<<___; +.type __inner_loop_31_256,\@abi-omnipotent +.align 32 # comment and punish Coffee Lake by up to 40% +__inner_loop_31_256: ################# by Thomas Pornin + mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0 + mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1 + mov \$0x7FFFFFFF7FFFFFFF, $bias + +.Loop_31_256: + cmp $b_, $a_ # if |a_|<|b_|, swap the variables + mov $a_, $t0 + mov $b_, $t1 + mov $fg0, $t2 + mov $fg1, $t3 + cmovb $b_, $a_ + cmovb $t0, $b_ + cmovb $fg1, $fg0 + cmovb $t2, $fg1 + + sub $b_, $a_ # |a_|-|b_| + sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1| + add $bias, $fg0 + + test \$1, $t0 # if |a_| was even, roll back + cmovz $t0, $a_ + cmovz $t1, $b_ + cmovz $t2, $fg0 + cmovz $t3, $fg1 + + shr \$1, $a_ # |a_|>>=1 + add $fg1, $fg1 # |f1|<<=1, |g1|<<=1 + sub $bias, $fg1 + sub \$1, $cnt + jnz .Loop_31_256 + + shr \$32, $bias + mov %ecx, %edx # $fg0, $f0 + mov ${fg1}d, ${f1}d + shr \$32, $g0 + shr \$32, $g1 + sub $bias, $f0 # remove the bias + sub $bias, $g0 + sub $bias, $f1 + sub $bias, $g1 + + ret +.size __inner_loop_31_256,.-__inner_loop_31_256 + +.type __inner_loop_62_256,\@abi-omnipotent +.align 32 +__inner_loop_62_256: + mov $cnt, %r15d + mov \$1, $f0 # |f0|=1 + xor $g0, $g0 # |g0|=0 + xor $f1, $f1 # |f1|=0 + mov $f0, $g1 # |g1|=1 + mov $f0, %r14 + +.Loop_62_256: + xor $t0, $t0 + test %r14, $a_lo # if |a_| is odd, then we'll be subtracting |b_| + mov $b_lo, $t1 + cmovnz $b_lo, $t0 + sub $a_lo, $t1 # |b_|-|a_| + mov $a_lo, $t2 + sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even) + cmovc $t1, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_| + cmovc $t2, $b_lo # |b_| = |a_| + mov $f0, $t0 # exchange |f0| and |f1| + cmovc $f1, $f0 + cmovc $t0, $f1 + mov $g0, $t1 # exchange |g0| and |g1| + cmovc $g1, $g0 + cmovc $t1, $g1 + xor $t0, $t0 + xor $t1, $t1 + shr \$1, $a_lo + test %r14, $t2 # if |a_| was odd, then we'll be subtracting... + cmovnz $f1, $t0 + cmovnz $g1, $t1 + add $f1, $f1 # |f1|<<=1 + add $g1, $g1 # |g1|<<=1 + sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...) + sub \$1, %r15d + jnz .Loop_62_256 + + ret +.size __inner_loop_62_256,.-__inner_loop_62_256 +___ +} + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/ct_inverse_mod_384-armv8.pl b/crypto/blst_src/asm/ct_inverse_mod_384-armv8.pl new file mode 100755 index 00000000000..268bf9d2546 --- /dev/null +++ b/crypto/blst_src/asm/ct_inverse_mod_384-armv8.pl @@ -0,0 +1,610 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast Euclidean inversion as suggested in +# https://eprint.iacr.org/2020/972. Performance is >12x better [on +# Cortex cores] than modulus-specific FLT addition chain... 
+# +# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_inverse_mod_383(inp, mod): + a, u = inp, 1 + b, v = mod, 0 + + k = 62 + w = 64 + mask = (1 << w) - 1 + + for i in range(0, 766 // k): + # __ab_approximation_62 + n = max(a.bit_length(), b.bit_length()) + if n < 128: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-w)) << w) + b_ = (b & mask) | ((b >> (n-w)) << w) + + # __inner_loop_62 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + + # __smul_383_n_shift_by_62 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if a < 0: + a, f0, g0 = -a, -f0, -g0 + if b < 0: + b, f1, g1 = -b, -f1, -g1 + + # __smul_767x63 + u, v = u*f0 + v*g0, u*f1 + v*g1 + + if 766 % k: + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, 766 % k): + if a & 1: + if a < b: + a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 + a, f0, g0 = a-b, f0-f1, g0-g1 + a, f1, g1 = a >> 1, f1 << 1, g1 << 1 + + v = u*f1 + v*g1 + + if v < 0: + v += mod << (768 - mod.bit_length()) # left aligned + + return v & (2**768 - 1) # to be reduced % mod +___ + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3)); +my @acc=map("x$_",(3..14)); +my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(15..17,19..21)); +my $cnt = $n_ptr; +my @t = map("x$_",(22..28,2)); +my ($a_lo, $a_hi, $b_lo, $b_hi) = @acc[0,5,6,11]; + +$frame = 16+2*512; + +$code.=<<___; +.text + +.globl ct_inverse_mod_383 +.type ct_inverse_mod_383, %function +.align 5 +ct_inverse_mod_383: + paciasp + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #$frame + + ldp @t[0], @acc[1], [$in_ptr,#8*0] + ldp @acc[2], @acc[3], [$in_ptr,#8*2] + ldp @acc[4], @acc[5], [$in_ptr,#8*4] + + add $in_ptr, sp, #16+511 // find closest 512-byte-aligned spot + and $in_ptr, $in_ptr, #-512 // in the frame... 
+ stp $out_ptr, $nx_ptr, [sp] + + ldp @acc[6], @acc[7], [$n_ptr,#8*0] + ldp @acc[8], @acc[9], [$n_ptr,#8*2] + ldp @acc[10], @acc[11], [$n_ptr,#8*4] + + stp @t[0], @acc[1], [$in_ptr,#8*0] // copy input to |a| + stp @acc[2], @acc[3], [$in_ptr,#8*2] + stp @acc[4], @acc[5], [$in_ptr,#8*4] + stp @acc[6], @acc[7], [$in_ptr,#8*6] // copy modulus to |b| + stp @acc[8], @acc[9], [$in_ptr,#8*8] + stp @acc[10], @acc[11], [$in_ptr,#8*10] + + ////////////////////////////////////////// first iteration + mov $cnt, #62 + bl .Lab_approximation_62_loaded + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + str $f0,[$out_ptr,#8*12] // initialize |u| with |f0| + + mov $f0, $f1 // |f1| + mov $g0, $g1 // |g1| + add $out_ptr, $out_ptr, #8*6 // pointer to dst |b| + bl __smul_383_n_shift_by_62 + str $f0, [$out_ptr,#8*12] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| + mov $cnt, #62 + bl __ab_approximation_62 + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov $f_, $f0 // corrected |f0| + mov $g_, $g0 // corrected |g0| + + mov $f0, $f1 // |f1| + mov $g0, $g1 // |g1| + add $out_ptr, $out_ptr, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + ldr @acc[4], [$in_ptr,#8*12] // |u| + ldr @acc[5], [$in_ptr,#8*18] // |v| + mul @acc[0], $f_, @acc[4] // |u|*|f0| + smulh @acc[1], $f_, @acc[4] + mul @acc[2], $g_, @acc[5] // |v|*|g0| + smulh @acc[3], $g_, @acc[5] + adds @acc[0], @acc[0], @acc[2] + adc @acc[1], @acc[1], @acc[3] + stp @acc[0], @acc[1], [$out_ptr,#8*6] + asr @acc[2], @acc[1], #63 // sign extenstion + stp @acc[2], @acc[2], [$out_ptr,#8*8] + stp @acc[2], @acc[2], [$out_ptr,#8*10] + + mul @acc[0], $f0, @acc[4] // |u|*|f1| + smulh @acc[1], $f0, @acc[4] + mul @acc[2], $g0, @acc[5] // |v|*|g1| + smulh @acc[3], $g0, @acc[5] + adds @acc[0], @acc[0], @acc[2] + adc @acc[1], @acc[1], @acc[3] + stp @acc[0], @acc[1], [$out_ptr,#8*12] + asr @acc[2], @acc[1], #63 // sign extenstion + stp @acc[2], @acc[2], [$out_ptr,#8*14] + stp @acc[2], @acc[2], [$out_ptr,#8*16] +___ +for($i=2; $i<11; $i++) { +$code.=<<___; + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| + mov $cnt, #62 + bl __ab_approximation_62 + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov $f_, $f0 // corrected |f0| + mov $g_, $g0 // corrected |g0| + + mov $f0, $f1 // |f1| + mov $g0, $g1 // |g1| + add $out_ptr, $out_ptr, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add $out_ptr, $out_ptr, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov $f_, $f0 // corrected |f1| + mov $g_, $g0 // corrected |g1| + add $out_ptr, $out_ptr, #8*6 // pointer to destination |v| + bl __smul_383x63 +___ +$code.=<<___ if ($i>5); + bl __smul_767x63_tail +___ +$code.=<<___ if ($i==5); + asr @t[5], @t[5], #63 // sign extension + stp @t[5], @t[5], [$out_ptr,#8*6] + stp @t[5], @t[5], [$out_ptr,#8*8] + stp @t[5], @t[5], [$out_ptr,#8*10] +___ +} +$code.=<<___; + ////////////////////////////////////////// iteration before last + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| + mov $cnt, #62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldp $a_lo, $a_hi, [$in_ptr,#8*0] // just load + ldp $b_lo, $b_hi, [$in_ptr,#8*6] + bl __inner_loop_62 + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| + str $a_lo, [$out_ptr,#8*0] + str $b_lo, [$out_ptr,#8*6] + + mov $f_, $f0 // exact |f0| + mov $g_, $g0 // 
exact |g0| + mov $f0, $f1 + mov $g0, $g1 + add $out_ptr, $out_ptr, #8*12 // pointer to dst |u| + bl __smul_383x63 + + mov $f_, $f0 // exact |f1| + mov $g_, $g0 // exact |g1| + add $out_ptr, $out_ptr, #8*6 // pointer to dst |v| + bl __smul_383x63 + bl __smul_767x63_tail + + ////////////////////////////////////////// last iteration + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| + mov $cnt, #22 // 766 % 62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldr $a_lo, [$in_ptr,#8*0] // just load + eor $a_hi, $a_hi, $a_hi + ldr $b_lo, [$in_ptr,#8*6] + eor $b_hi, $b_hi, $b_hi + bl __inner_loop_62 + + mov $f_, $f1 + mov $g_, $g1 + ldp $out_ptr, $f0, [sp] // original out_ptr and n_ptr + bl __smul_383x63 + bl __smul_767x63_tail + ldr x30, [x29,#8] + + asr @t[0], @acc[5], #63 // sign as mask + ldp @acc[6], @acc[7], [$f0,#8*0] + ldp @acc[8], @acc[9], [$f0,#8*2] + ldp @acc[10], @acc[11], [$f0,#8*4] + + and @acc[6], @acc[6], @t[0] // add mod<<384 conditionally + and @acc[7], @acc[7], @t[0] + adds @acc[0], @acc[0], @acc[6] + and @acc[8], @acc[8], @t[0] + adcs @acc[1], @acc[1], @acc[7] + and @acc[9], @acc[9], @t[0] + adcs @acc[2], @acc[2], @acc[8] + and @acc[10], @acc[10], @t[0] + adcs @acc[3], @acc[3], @acc[9] + and @acc[11], @acc[11], @t[0] + stp @acc[0], @acc[1], [$out_ptr,#8*6] + adcs @acc[4], @acc[4], @acc[10] + stp @acc[2], @acc[3], [$out_ptr,#8*8] + adc @acc[5], @acc[5], @acc[11] + stp @acc[4], @acc[5], [$out_ptr,#8*10] + + add sp, sp, #$frame + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + autiasp + ret +.size ct_inverse_mod_383,.-ct_inverse_mod_383 + +//////////////////////////////////////////////////////////////////////// +// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
+.type __smul_383x63, %function +.align 5 +__smul_383x63: +___ +for($j=0; $j<2; $j++) { +my $f_ = $f_; $f_ = $g_ if ($j); +my @acc = @acc; @acc = @acc[6..11] if ($j); +my $k = 8*12+8*6*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |u| (or |v|) + asr $f1, $f_, #63 // |f_|'s sign as mask (or |g_|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor $f_, $f_, $f1 // conditionally negate |f_| (or |g_|) + ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k] + + eor @acc[0], @acc[0], $f1 // conditionally negate |u| (or |v|) + sub $f_, $f_, $f1 + eor @acc[1], @acc[1], $f1 + adds @acc[0], @acc[0], $f1, lsr#63 + eor @acc[2], @acc[2], $f1 + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], $f1 + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], $f1 + adcs @acc[3], @acc[3], xzr + umulh @t[0], @acc[0], $f_ + eor @acc[5], @acc[5], $f1 + umulh @t[1], @acc[1], $f_ + adcs @acc[4], @acc[4], xzr + umulh @t[2], @acc[2], $f_ + adcs @acc[5], @acc[5], xzr + umulh @t[3], @acc[3], $f_ +___ +$code.=<<___ if ($j); + adc $g1, xzr, xzr // used in __smul_767x63_tail +___ +$code.=<<___; + umulh @t[4], @acc[4], $f_ + mul @acc[0], @acc[0], $f_ + mul @acc[1], @acc[1], $f_ + mul @acc[2], @acc[2], $f_ + adds @acc[1], @acc[1], @t[0] + mul @acc[3], @acc[3], $f_ + adcs @acc[2], @acc[2], @t[1] + mul @acc[4], @acc[4], $f_ + adcs @acc[3], @acc[3], @t[2] + mul @t[5+$j],@acc[5], $f_ + adcs @acc[4], @acc[4], @t[3] + adcs @t[5+$j],@t[5+$j],@t[4] +___ +$code.=<<___ if ($j==0); + adc @t[7], xzr, xzr +___ +} +$code.=<<___; + adc @t[7], @t[7], xzr + + adds @acc[0], @acc[0], @acc[6] + adcs @acc[1], @acc[1], @acc[7] + adcs @acc[2], @acc[2], @acc[8] + adcs @acc[3], @acc[3], @acc[9] + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adcs @acc[4], @acc[4], @acc[10] + stp @acc[2], @acc[3], [$out_ptr,#8*2] + adcs @t[5], @t[5], @t[6] + stp @acc[4], @t[5], [$out_ptr,#8*4] + adc @t[6], @t[7], xzr // used in __smul_767x63_tail + + ret +.size __smul_383x63,.-__smul_383x63 + +.type __smul_767x63_tail, %function +.align 5 +__smul_767x63_tail: + smulh @t[5], @acc[5], $f_ + ldp @acc[0], @acc[1], [$in_ptr,#8*24] // load rest of |v| + umulh @acc[11],@acc[11], $g_ + ldp @acc[2], @acc[3], [$in_ptr,#8*26] + ldp @acc[4], @acc[5], [$in_ptr,#8*28] + + eor @acc[0], @acc[0], $f1 // conditionally negate rest of |v| + eor @acc[1], @acc[1], $f1 + eor @acc[2], @acc[2], $f1 + adds @acc[0], @acc[0], $g1 + eor @acc[3], @acc[3], $f1 + adcs @acc[1], @acc[1], xzr + eor @acc[4], @acc[4], $f1 + adcs @acc[2], @acc[2], xzr + eor @acc[5], @acc[5], $f1 + adcs @acc[3], @acc[3], xzr + umulh @t[0], @acc[0], $g_ + adcs @acc[4], @acc[4], xzr + umulh @t[1], @acc[1], $g_ + adc @acc[5], @acc[5], xzr + + umulh @t[2], @acc[2], $g_ + add @acc[11], @acc[11], @t[6] + umulh @t[3], @acc[3], $g_ + asr @t[6], @t[5], #63 + umulh @t[4], @acc[4], $g_ + mul @acc[0], @acc[0], $g_ + mul @acc[1], @acc[1], $g_ + mul @acc[2], @acc[2], $g_ + adds @acc[0], @acc[0], @acc[11] + mul @acc[3], @acc[3], $g_ + adcs @acc[1], @acc[1], @t[0] + mul @acc[4], @acc[4], $g_ + adcs @acc[2], @acc[2], @t[1] + mul @acc[5], @acc[5], $g_ + adcs @acc[3], @acc[3], @t[2] + adcs @acc[4], @acc[4], @t[3] + adc @acc[5], @acc[5], @t[4] + + adds @acc[0], @acc[0], @t[5] + adcs @acc[1], @acc[1], @t[6] + adcs @acc[2], @acc[2], @t[6] + adcs @acc[3], @acc[3], @t[6] + stp @acc[0], @acc[1], [$out_ptr,#8*6] + adcs @acc[4], @acc[4], @t[6] + stp @acc[2], @acc[3], [$out_ptr,#8*8] + adc @acc[5], @acc[5], @t[6] + stp @acc[4], @acc[5], [$out_ptr,#8*10] + + ret +.size __smul_767x63_tail,.-__smul_767x63_tail + +.type __smul_383_n_shift_by_62, 
%function +.align 5 +__smul_383_n_shift_by_62: +___ +for($j=0; $j<2; $j++) { +my $f0 = $f0; $f0 = $g0 if ($j); +my @acc = @acc; @acc = @acc[6..11] if ($j); +my $k = 8*6*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |a| (or |b|) + asr @t[6], $f0, #63 // |f0|'s sign as mask (or |g0|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor @t[7], $f0, @t[6] // conditionally negate |f0| (or |g0|) + ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k] + + eor @acc[0], @acc[0], @t[6] // conditionally negate |a| (or |b|) + sub @t[7], @t[7], @t[6] + eor @acc[1], @acc[1], @t[6] + adds @acc[0], @acc[0], @t[6], lsr#63 + eor @acc[2], @acc[2], @t[6] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], @t[6] + umulh @t[0], @acc[0], @t[7] + adcs @acc[3], @acc[3], xzr + umulh @t[1], @acc[1], @t[7] + eor @acc[5], @acc[5], @t[6] + umulh @t[2], @acc[2], @t[7] + adcs @acc[4], @acc[4], xzr + umulh @t[3], @acc[3], @t[7] + adc @acc[5], @acc[5], xzr + + umulh @t[4], @acc[4], @t[7] + smulh @t[5+$j], @acc[5], @t[7] + mul @acc[0], @acc[0], @t[7] + mul @acc[1], @acc[1], @t[7] + mul @acc[2], @acc[2], @t[7] + adds @acc[1], @acc[1], @t[0] + mul @acc[3], @acc[3], @t[7] + adcs @acc[2], @acc[2], @t[1] + mul @acc[4], @acc[4], @t[7] + adcs @acc[3], @acc[3], @t[2] + mul @acc[5], @acc[5], @t[7] + adcs @acc[4], @acc[4], @t[3] + adcs @acc[5], @acc[5] ,@t[4] + adc @t[5+$j], @t[5+$j], xzr +___ +} +$code.=<<___; + adds @acc[0], @acc[0], @acc[6] + adcs @acc[1], @acc[1], @acc[7] + adcs @acc[2], @acc[2], @acc[8] + adcs @acc[3], @acc[3], @acc[9] + adcs @acc[4], @acc[4], @acc[10] + adcs @acc[5], @acc[5], @acc[11] + adc @acc[6], @t[5], @t[6] + + extr @acc[0], @acc[1], @acc[0], #62 + extr @acc[1], @acc[2], @acc[1], #62 + extr @acc[2], @acc[3], @acc[2], #62 + asr @t[6], @acc[6], #63 + extr @acc[3], @acc[4], @acc[3], #62 + extr @acc[4], @acc[5], @acc[4], #62 + extr @acc[5], @acc[6], @acc[5], #62 + + eor @acc[0], @acc[0], @t[6] + eor @acc[1], @acc[1], @t[6] + adds @acc[0], @acc[0], @t[6], lsr#63 + eor @acc[2], @acc[2], @t[6] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], @t[6] + adcs @acc[3], @acc[3], xzr + eor @acc[5], @acc[5], @t[6] + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adcs @acc[4], @acc[4], xzr + stp @acc[2], @acc[3], [$out_ptr,#8*2] + adc @acc[5], @acc[5], xzr + stp @acc[4], @acc[5], [$out_ptr,#8*4] + + eor $f0, $f0, @t[6] + eor $g0, $g0, @t[6] + sub $f0, $f0, @t[6] + sub $g0, $g0, @t[6] + + ret +.size __smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62 +___ + +{ +my @a = @acc[0..5]; +my @b = @acc[6..11]; + +$code.=<<___; +.type __ab_approximation_62, %function +.align 4 +__ab_approximation_62: + ldp @a[4], @a[5], [$in_ptr,#8*4] + ldp @b[4], @b[5], [$in_ptr,#8*10] + ldp @a[2], @a[3], [$in_ptr,#8*2] + ldp @b[2], @b[3], [$in_ptr,#8*8] + +.Lab_approximation_62_loaded: + orr @t[0], @a[5], @b[5] // check top-most limbs, ... + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[3], ne + orr @t[0], @a[5], @b[5] // ... ones before top-most, ... + csel @b[4], @b[4], @b[3], ne + + ldp @a[0], @a[1], [$in_ptr,#8*0] + ldp @b[0], @b[1], [$in_ptr,#8*6] + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[2], ne + orr @t[0], @a[5], @b[5] // ... and ones before that ... 
+ csel @b[4], @b[4], @b[2], ne + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[1], ne + orr @t[0], @a[5], @b[5] + csel @b[4], @b[4], @b[1], ne + + clz @t[0], @t[0] + cmp @t[0], #64 + csel @t[0], @t[0], xzr, ne + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + neg @t[1], @t[0] + + lslv @a[5], @a[5], @t[0] // align high limbs to the left + lslv @b[5], @b[5], @t[0] + lsrv @a[4], @a[4], @t[1] + lsrv @b[4], @b[4], @t[1] + and @a[4], @a[4], @t[1], asr#6 + and @b[4], @b[4], @t[1], asr#6 + orr @a[5], @a[5], @a[4] + orr @b[5], @b[5], @b[4] + + b __inner_loop_62 + ret +.size __ab_approximation_62,.-__ab_approximation_62 +___ +} +$code.=<<___; +.type __inner_loop_62, %function +.align 4 +__inner_loop_62: + mov $f0, #1 // |f0|=1 + mov $g0, #0 // |g0|=0 + mov $f1, #0 // |f1|=0 + mov $g1, #1 // |g1|=1 + +.Loop_62: + sbfx @t[6], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting + sub $cnt, $cnt, #1 + subs @t[2], $b_lo, $a_lo // |b_|-|a_| + and @t[0], $b_lo, @t[6] + sbc @t[3], $b_hi, $a_hi + and @t[1], $b_hi, @t[6] + subs @t[4], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov @t[0], $f0 + sbcs @t[5], $a_hi, @t[1] + mov @t[1], $g0 + csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_| + csel $b_hi, $b_hi, $a_hi, hs + csel $a_lo, @t[4], @t[2], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel $a_hi, @t[5], @t[3], hs + csel $f0, $f0, $f1, hs // exchange |f0| and |f1| + csel $f1, $f1, @t[0], hs + csel $g0, $g0, $g1, hs // exchange |g0| and |g1| + csel $g1, $g1, @t[1], hs + extr $a_lo, $a_hi, $a_lo, #1 + lsr $a_hi, $a_hi, #1 + and @t[0], $f1, @t[6] + and @t[1], $g1, @t[6] + add $f1, $f1, $f1 // |f1|<<=1 + add $g1, $g1, $g1 // |g1|<<=1 + sub $f0, $f0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $g0, $g0, @t[1] // |g0|-=|g1| (or |g0-=0| ...) + cbnz $cnt, .Loop_62 + + ret +.size __inner_loop_62,.-__inner_loop_62 +___ + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/ct_is_square_mod_384-armv8.pl b/crypto/blst_src/asm/ct_is_square_mod_384-armv8.pl new file mode 100755 index 00000000000..4128dc3236d --- /dev/null +++ b/crypto/blst_src/asm/ct_is_square_mod_384-armv8.pl @@ -0,0 +1,401 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast quadratic residue test as suggested in +# https://eprint.iacr.org/2020/972. Performance is >12x better [on +# Cortex cores] than modulus-specific Legendre symbol addition chain... 
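+#
+# In the $python_ref model below |L| matters only in its least significant
+# bit, which tracks the sign of the symbol; the boolean result is computed
+# as (L & 1) ^ 1.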
+# +# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_is_square_mod_384(inp, mod): + a = inp + b = mod + L = 0 # only least significant bit, adding 1 makes up for sign change + + k = 30 + w = 32 + mask = (1 << w) - 1 + + for i in range(0, 768 // k - 1): + # __ab_approximation_30 + n = max(a.bit_length(), b.bit_length()) + if n < 64: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-w)) << w) + b_ = (b & mask) | ((b >> (n-w)) << w) + + # __inner_loop_30 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + L += (a_ & b_) >> 1 # |a| and |b| are both odd, second bits + # tell the whole story + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + L += (b_ + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] + + # __smulq_384_n_shift_by_30 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if b < 0: + b = -b + if a < 0: + a = -a + L += (b % 4) >> 1 # |b| is always odd, the second bit + # tells the whole story + + if True: + for j in range(0, 768 % k + k): + if a & 1: + if a < b: + a, b = b, a + L += (a & b) >> 1 # |a| and |b| are both odd, second bits + # tell the whole story + a = a-b + a = a >> 1 + L += (b + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] + + return (L & 1) ^ 1 +___ + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +my ($in_ptr, $out_ptr, $L) = map("x$_", (0..2)); +my @acc=map("x$_",(3..14)); +my ($cnt, $f0, $g0, $f1, $g1) = map("x$_",(15..17,19..20)); +my @t = map("x$_",(21..28)); +my ($a_, $b_) = @acc[5,11]; + +$frame = 2*256; + +$code.=<<___; +.text + +.globl ct_is_square_mod_384 +.type ct_is_square_mod_384, %function +.align 5 +ct_is_square_mod_384: + paciasp + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #$frame + + ldp @acc[0], @acc[1], [x0,#8*0] // load input + ldp @acc[2], @acc[3], [x0,#8*2] + ldp @acc[4], @acc[5], [x0,#8*4] + + add $in_ptr, sp, #255 // find closest 256-byte-aligned spot + and $in_ptr, $in_ptr, #-256 // in the frame... 
+ + ldp @acc[6], @acc[7], [x1,#8*0] // load modulus + ldp @acc[8], @acc[9], [x1,#8*2] + ldp @acc[10], @acc[11], [x1,#8*4] + + stp @acc[0], @acc[1], [$in_ptr,#8*6] // copy input to |a| + stp @acc[2], @acc[3], [$in_ptr,#8*8] + stp @acc[4], @acc[5], [$in_ptr,#8*10] + stp @acc[6], @acc[7], [$in_ptr,#8*0] // copy modulus to |b| + stp @acc[8], @acc[9], [$in_ptr,#8*2] + stp @acc[10], @acc[11], [$in_ptr,#8*4] + + eor $L, $L, $L // init the Legendre symbol + mov $cnt, #24 // 24 is 768/30-1 + b .Loop_is_square + +.align 4 +.Loop_is_square: + bl __ab_approximation_30 + sub $cnt, $cnt, #1 + + eor $out_ptr, $in_ptr, #128 // pointer to dst |b| + bl __smul_384_n_shift_by_30 + + mov $f1, $f0 // |f0| + mov $g1, $g0 // |g0| + add $out_ptr, $out_ptr, #8*6 // pointer to dst |a| + bl __smul_384_n_shift_by_30 + + ldp @acc[6], @acc[7], [$out_ptr,#-8*6] + eor $in_ptr, $in_ptr, #128 // flip-flop src |a|b| + and @t[6], @t[6], @acc[6] // if |a| was negative, + add $L, $L, @t[6], lsr#1 // adjust |L| + + cbnz $cnt, .Loop_is_square + + ////////////////////////////////////////// last iteration + //bl __ab_approximation_30 // |a| and |b| are exact, + //ldr $a_, [$in_ptr,#8*6] // and loaded + //ldr $b_, [$in_ptr,#8*0] + mov $cnt, #48 // 48 is 768%30 + 30 + bl __inner_loop_48 + ldr x30, [x29,#8] + + and x0, $L, #1 + eor x0, x0, #1 + + add sp, sp, #$frame + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + autiasp + ret +.size ct_is_square_mod_384,.-ct_is_square_mod_384 + +.type __smul_384_n_shift_by_30, %function +.align 5 +__smul_384_n_shift_by_30: +___ +for($j=0; $j<2; $j++) { +my $fx = $g1; $fx = $f1 if ($j); +my @acc = @acc; @acc = @acc[6..11] if ($j); +my $k = 8*6*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |b| (or |a|) + asr @t[6], $fx, #63 // |g1|'s sign as mask (or |f1|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor $fx, $fx, @t[6] // conditionally negate |g1| (or |f1|) + ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k] + + eor @acc[0], @acc[0], @t[6] // conditionally negate |b| (or |a|) + sub $fx, $fx, @t[6] + eor @acc[1], @acc[1], @t[6] + adds @acc[0], @acc[0], @t[6], lsr#63 + eor @acc[2], @acc[2], @t[6] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], @t[6] + umulh @t[0], @acc[0], $fx + adcs @acc[3], @acc[3], xzr + umulh @t[1], @acc[1], $fx + eor @acc[5], @acc[5], @t[6] + umulh @t[2], @acc[2], $fx + adcs @acc[4], @acc[4], xzr + umulh @t[3], @acc[3], $fx + adc @acc[5], @acc[5], xzr + + umulh @t[4], @acc[4], $fx + and @t[7], $fx, @t[6] + umulh @t[5+$j], @acc[5], $fx + neg @t[7], @t[7] + mul @acc[0], @acc[0], $fx + mul @acc[1], @acc[1], $fx + mul @acc[2], @acc[2], $fx + adds @acc[1], @acc[1], @t[0] + mul @acc[3], @acc[3], $fx + adcs @acc[2], @acc[2], @t[1] + mul @acc[4], @acc[4], $fx + adcs @acc[3], @acc[3], @t[2] + mul @acc[5], @acc[5], $fx + adcs @acc[4], @acc[4], @t[3] + adcs @acc[5], @acc[5] ,@t[4] + adc @t[5+$j], @t[5+$j], @t[7] +___ +} +$code.=<<___; + adds @acc[0], @acc[0], @acc[6] + adcs @acc[1], @acc[1], @acc[7] + adcs @acc[2], @acc[2], @acc[8] + adcs @acc[3], @acc[3], @acc[9] + adcs @acc[4], @acc[4], @acc[10] + adcs @acc[5], @acc[5], @acc[11] + adc @acc[6], @t[5], @t[6] + + extr @acc[0], @acc[1], @acc[0], #30 + extr @acc[1], @acc[2], @acc[1], #30 + extr @acc[2], @acc[3], @acc[2], #30 + asr @t[6], @acc[6], #63 + extr @acc[3], @acc[4], @acc[3], #30 + extr @acc[4], @acc[5], @acc[4], #30 + extr @acc[5], @acc[6], @acc[5], #30 + + 
eor @acc[0], @acc[0], @t[6] + eor @acc[1], @acc[1], @t[6] + adds @acc[0], @acc[0], @t[6], lsr#63 + eor @acc[2], @acc[2], @t[6] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], @t[6] + adcs @acc[3], @acc[3], xzr + eor @acc[5], @acc[5], @t[6] + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adcs @acc[4], @acc[4], xzr + stp @acc[2], @acc[3], [$out_ptr,#8*2] + adc @acc[5], @acc[5], xzr + stp @acc[4], @acc[5], [$out_ptr,#8*4] + + ret +.size __smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30 +___ + +{ +my @a = @acc[0..5]; +my @b = @acc[6..11]; +my ($fg0, $fg1, $bias, $cnt) = ($g0, $g1, @t[6], @t[7]); + +$code.=<<___; +.type __ab_approximation_30, %function +.align 4 +__ab_approximation_30: + ldp @b[4], @b[5], [$in_ptr,#8*4] // |a| is still in registers + ldp @b[2], @b[3], [$in_ptr,#8*2] + + orr @t[0], @a[5], @b[5] // check top-most limbs, ... + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[3], ne + orr @t[0], @a[5], @b[5] // ... ones before top-most, ... + csel @b[4], @b[4], @b[3], ne + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[2], ne + orr @t[0], @a[5], @b[5] // ... and ones before that ... + csel @b[4], @b[4], @b[2], ne + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[1], ne + orr @t[0], @a[5], @b[5] // and one more, ... + csel @b[4], @b[4], @b[1], ne + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[0], ne + orr @t[0], @a[5], @b[5] + csel @b[4], @b[4], @b[0], ne + + clz @t[0], @t[0] + cmp @t[0], #64 + csel @t[0], @t[0], xzr, ne + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + neg @t[1], @t[0] + + lslv @a[5], @a[5], @t[0] // align high limbs to the left + lslv @b[5], @b[5], @t[0] + lsrv @a[4], @a[4], @t[1] + lsrv @b[4], @b[4], @t[1] + and @a[4], @a[4], @t[1], asr#6 + and @b[4], @b[4], @t[1], asr#6 + orr $a_, @a[5], @a[4] + orr $b_, @b[5], @b[4] + + bfxil $a_, @a[0], #0, #32 + bfxil $b_, @b[0], #0, #32 + + b __inner_loop_30 + ret +.size __ab_approximation_30,.-__ab_approximation_30 + +.type __inner_loop_30, %function +.align 4 +__inner_loop_30: + mov $cnt, #30 + mov $fg0, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov $fg1, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov $bias,#0x7FFFFFFF7FFFFFFF + +.Loop_30: + sbfx @t[3], $a_, #0, #1 // if |a_| is odd, then we'll be subtracting + and @t[4], $a_, $b_ + sub $cnt, $cnt, #1 + and @t[0], $b_, @t[3] + + sub @t[1], $b_, $a_ // |b_|-|a_| + subs @t[2], $a_, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + add @t[4], $L, @t[4], lsr#1 // L + (a_ & b_) >> 1 + mov @t[0], $fg1 + csel $b_, $b_, $a_, hs // |b_| = |a_| + csel $a_, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel $fg1, $fg1, $fg0, hs // exchange |fg0| and |fg1| + csel $fg0, $fg0, @t[0], hs + csel $L, $L, @t[4], hs + lsr $a_, $a_, #1 + and @t[0], $fg1, @t[3] + and @t[1], $bias, @t[3] + add $t[2], $b_, #2 + sub $fg0, $fg0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add $fg1, $fg1, $fg1 // |f1|<<=1 + add $L, $L, $t[2], lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + add $fg0, $fg0, @t[1] + sub $fg1, $fg1, $bias + + cbnz $cnt, .Loop_30 + + mov $bias, #0x7FFFFFFF + ubfx $f0, $fg0, #0, #32 + ubfx $g0, $fg0, #32, #32 + ubfx $f1, $fg1, #0, #32 + ubfx $g1, $fg1, #32, #32 + sub $f0, $f0, $bias // remove the bias + sub $g0, $g0, $bias + sub $f1, $f1, $bias + sub $g1, $g1, $bias + + ret +.size 
__inner_loop_30,.-__inner_loop_30 +___ +} + +{ +my ($a_, $b_) = (@acc[0], @acc[6]); +$code.=<<___; +.type __inner_loop_48, %function +.align 4 +__inner_loop_48: +.Loop_48: + sbfx @t[3], $a_, #0, #1 // if |a_| is odd, then we'll be subtracting + and @t[4], $a_, $b_ + sub $cnt, $cnt, #1 + and @t[0], $b_, @t[3] + sub @t[1], $b_, $a_ // |b_|-|a_| + subs @t[2], $a_, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + add @t[4], $L, @t[4], lsr#1 + csel $b_, $b_, $a_, hs // |b_| = |a_| + csel $a_, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel $L, $L, @t[4], hs + add $t[2], $b_, #2 + lsr $a_, $a_, #1 + add $L, $L, $t[2], lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + + cbnz $cnt, .Loop_48 + + ret +.size __inner_loop_48,.-__inner_loop_48 +___ +} + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/ct_is_square_mod_384-x86_64.pl b/crypto/blst_src/asm/ct_is_square_mod_384-x86_64.pl new file mode 100755 index 00000000000..40016ed70d2 --- /dev/null +++ b/crypto/blst_src/asm/ct_is_square_mod_384-x86_64.pl @@ -0,0 +1,494 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast quadratic residue test as suggested in +# https://eprint.iacr.org/2020/972. Performance is >5x better than +# modulus-specific Legendre symbol addition chain... +# +# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_is_square_mod_384(inp, mod): + a = inp + b = mod + L = 0 # only least significant bit, adding 1 makes up for sign change + + k = 30 + w = 32 + mask = (1 << w) - 1 + + for i in range(0, 768 // k - 1): + # __ab_approximation_30 + n = max(a.bit_length(), b.bit_length()) + if n < 64: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-w)) << w) + b_ = (b & mask) | ((b >> (n-w)) << w) + + # __inner_loop_30 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + L += (a_ & b_) >> 1 # |a| and |b| are both odd, second bits + # tell the whole story + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + L += (b_ + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] + + # __smulq_384_n_shift_by_30 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if b < 0: + b = -b + if a < 0: + a = -a + L += (b % 4) >> 1 # |b| is always odd, the second bit + # tells the whole story + + if True: + for j in range(0, 768 % k + k): + if a & 1: + if a < b: + a, b = b, a + L += (a & b) >> 1 # |a| and |b| are both odd, second bits + # tell the whole story + a = a-b + a = a >> 1 + L += (b + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] + + return (L & 1) ^ 1 +___ + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +my ($out_ptr, $in_ptr) = ("%rdi", "%rsi"); +my ($f0, $g0, $f1, $g1) = ("%rax", "%rbx", "%rdx","%rcx"); +my @acc=map("%r$_",(8..15)); +my $L = "%rbp"; + +$frame = 8*3+2*256; + +$code.=<<___; +.text + +.globl ct_is_square_mod_384 +.type ct_is_square_mod_384,\@function,2,"unwind" +.align 32 
+ct_is_square_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 8*3+255(%rsp), %rax # find closest 256-byte-aligned spot + and \$-256, %rax # in the frame... + + mov 8*0(%rdi), @acc[0] # load input + mov 8*1(%rdi), @acc[1] + mov 8*2(%rdi), @acc[2] + mov 8*3(%rdi), @acc[3] + mov 8*4(%rdi), @acc[4] + mov 8*5(%rdi), @acc[5] + + mov 8*0(%rsi), @acc[6] # load modulus + mov 8*1(%rsi), @acc[7] + mov 8*2(%rsi), %rbx + mov 8*3(%rsi), %rcx + mov 8*4(%rsi), %rdx + mov 8*5(%rsi), %rdi + mov %rax, $in_ptr # pointer to source |a|b| + + mov @acc[0], 8*0(%rax) # copy input to |a| + mov @acc[1], 8*1(%rax) + mov @acc[2], 8*2(%rax) + mov @acc[3], 8*3(%rax) + mov @acc[4], 8*4(%rax) + mov @acc[5], 8*5(%rax) + + mov @acc[6], 8*6(%rax) # copy modulus to |b| + mov @acc[7], 8*7(%rax) + mov %rbx, 8*8(%rax) + mov %rcx, 8*9(%rax) + mov %rdx, 8*10(%rax) + mov %rdi, 8*11(%rax) + + xor $L, $L # initialize the Legendre symbol + mov \$24, %ecx # 24 is 768/30-1 + jmp .Loop_is_square + +.align 32 +.Loop_is_square: + mov %ecx, 8*2(%rsp) # offload loop counter + + call __ab_approximation_30 + mov $f0, 8*0(%rsp) # offload |f0| and |g0| + mov $g0, 8*1(%rsp) + + mov \$128+8*6, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |b| + call __smulq_384_n_shift_by_30 + + mov 8*0(%rsp), $f1 # pop |f0| and |g0| + mov 8*1(%rsp), $g1 + lea -8*6($out_ptr),$out_ptr # pointer to destination |a| + call __smulq_384_n_shift_by_30 + + mov 8*2(%rsp), %ecx # re-load loop counter + xor \$128, $in_ptr # flip-flop pointer to source |a|b| + + and 8*6($out_ptr), @acc[6] # if |a| was negative, adjust |L| + shr \$1, @acc[6] + add @acc[6], $L + + sub \$1, %ecx + jnz .Loop_is_square + + ################################# last iteration + #call __ab_approximation_30 # |a| and |b| are exact, just load + #mov 8*0($in_ptr), @acc[0] # |a_| + mov 8*6($in_ptr), @acc[1] # |b_| + call __inner_loop_48 # 48 is 768%30+30 + + mov \$1, %rax + and $L, %rax + xor \$1, %rax # return value + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size ct_is_square_mod_384,.-ct_is_square_mod_384 + +.type __smulq_384_n_shift_by_30,\@abi-omnipotent +.align 32 +__smulq_384_n_shift_by_30: +___ +for($j=0; $j<2; $j++) { +$code.=<<___; + mov 8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov %rdx, %rbx # |f1| (or |g1|) + sar \$63, %rdx # |f1|'s sign as mask (or |g1|'s) + xor %rax, %rax + sub %rdx, %rax # |f1|'s sign as bit (or |g1|'s) + + xor %rdx, %rbx # conditionally negate |f1| (or |g1|) + add %rax, %rbx + + xor %rdx, @acc[0] # conditionally negate |a| (or |b|) + xor %rdx, @acc[1] + xor %rdx, @acc[2] + xor %rdx, @acc[3] + xor %rdx, @acc[4] + xor %rdx, @acc[5] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mov %rdx, @acc[6+$j] + and %rbx, @acc[6+$j] + mulq %rbx # |a|*|f1| (or |b|*|g1|) + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, 
@acc[1] +___ +for($i=1; $i<5; $i++) { +$code.=<<___; + mulq %rbx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + neg @acc[6+$j] + mulq %rbx + add %rax, @acc[5] + adc %rdx, @acc[6+$j] +___ +$code.=<<___ if ($j==0); + lea 8*6($in_ptr), $in_ptr # pointer to |b| + mov $g1, %rdx + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) +___ +} +$code.=<<___; + lea -8*6($in_ptr), $in_ptr # restore original in_ptr + + add 8*0($out_ptr), @acc[0] + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), @acc[5] + adc @acc[7], @acc[6] + + shrd \$30, @acc[1], @acc[0] + shrd \$30, @acc[2], @acc[1] + shrd \$30, @acc[3], @acc[2] + shrd \$30, @acc[4], @acc[3] + shrd \$30, @acc[5], @acc[4] + shrd \$30, @acc[6], @acc[5] + + sar \$63, @acc[6] # sign as mask + xor %rbx, %rbx + sub @acc[6], %rbx # sign as bit + + xor @acc[6], @acc[0] # conditionally negate the result + xor @acc[6], @acc[1] + xor @acc[6], @acc[2] + xor @acc[6], @acc[3] + xor @acc[6], @acc[4] + xor @acc[6], @acc[5] + add %rbx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) + + ret +.size __smulq_384_n_shift_by_30,.-__smulq_384_n_shift_by_30 +___ +{ +my ($a_, $b_) = @acc[0..1]; +my ($t0, $t1, $t2, $t3, $t4, $t5) = map("%r$_",(10..15)); +my ($fg0, $fg1, $bias) = ($g0, $g1, $t5); +my $cnt = "%edi"; +{ +my @a = @acc[0..5]; +my @b = (@a[1..3], $t4, $t5, $g0); + +$code.=<<___; +.type __ab_approximation_30,\@abi-omnipotent +.align 32 +__ab_approximation_30: + mov 8*11($in_ptr), @b[5] # load |b| in reverse order + mov 8*10($in_ptr), @b[4] + mov 8*9($in_ptr), @b[3] + + mov @a[5], %rax + or @b[5], %rax # check top-most limbs, ... + cmovz @a[4], @a[5] + cmovz @b[4], @b[5] + cmovz @a[3], @a[4] + mov 8*8($in_ptr), @b[2] + cmovz @b[3], @b[4] + + mov @a[5], %rax + or @b[5], %rax # ... ones before top-most, ... + cmovz @a[4], @a[5] + cmovz @b[4], @b[5] + cmovz @a[2], @a[4] + mov 8*7($in_ptr), @b[1] + cmovz @b[2], @b[4] + + mov @a[5], %rax + or @b[5], %rax # ... and ones before that ... + cmovz @a[4], @a[5] + cmovz @b[4], @b[5] + cmovz @a[1], @a[4] + mov 8*6($in_ptr), @b[0] + cmovz @b[1], @b[4] + + mov @a[5], %rax + or @b[5], %rax # ... and ones before that ... 
+ cmovz @a[4], @a[5] + cmovz @b[4], @b[5] + cmovz @a[0], @a[4] + cmovz @b[0], @b[4] + + mov @a[5], %rax + or @b[5], %rax + bsr %rax, %rcx + lea 1(%rcx), %rcx + cmovz @a[0], @a[5] + cmovz @b[0], @b[5] + cmovz %rax, %rcx + neg %rcx + #and \$63, %rcx # debugging artefact + + shldq %cl, @a[4], @a[5] # align second limb to the left + shldq %cl, @b[4], @b[5] + + mov \$0xFFFFFFFF00000000, %rax + mov @a[0]d, ${a_}d + mov @b[0]d, ${b_}d + and %rax, @a[5] + and %rax, @b[5] + or @a[5], ${a_} + or @b[5], ${b_} + + jmp __inner_loop_30 + + ret +.size __ab_approximation_30,.-__ab_approximation_30 +___ +} +$code.=<<___; +.type __inner_loop_30,\@abi-omnipotent +.align 32 +__inner_loop_30: ################# by Thomas Pornin + mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0 + mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1 + lea -1($fg0), $bias # 0x7FFFFFFF7FFFFFFF + mov \$30, $cnt + +.Loop_30: + mov $a_, %rax + and $b_, %rax + shr \$1, %rax # (a_ & b_) >> 1 + + cmp $b_, $a_ # if |a_|<|b_|, swap the variables + mov $a_, $t0 + mov $b_, $t1 + lea (%rax,$L), %rax # pre-"negate" |L| + mov $fg0, $t2 + mov $fg1, $t3 + mov $L, $t4 + cmovb $b_, $a_ + cmovb $t0, $b_ + cmovb $fg1, $fg0 + cmovb $t2, $fg1 + cmovb %rax, $L + + sub $b_, $a_ # |a_|-|b_| + sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1| + add $bias, $fg0 + + test \$1, $t0 # if |a_| was even, roll back + cmovz $t0, $a_ + cmovz $t1, $b_ + cmovz $t2, $fg0 + cmovz $t3, $fg1 + cmovz $t4, $L + + lea 2($b_), %rax + shr \$1, $a_ # |a_|>>=1 + shr \$2, %rax + add $fg1, $fg1 # |f1|<<=1, |g1|<<=1 + lea (%rax,$L), $L # "negate" |L| if |b|%8 is 3 or 5 + sub $bias, $fg1 + + sub \$1, $cnt + jnz .Loop_30 + + shr \$32, $bias + mov %ebx, %eax # $fg0 -> $f0 + shr \$32, $g0 + mov %ecx, %edx # $fg1 -> $f1 + shr \$32, $g1 + sub $bias, $f0 # remove the bias + sub $bias, $g0 + sub $bias, $f1 + sub $bias, $g1 + + ret +.size __inner_loop_30,.-__inner_loop_30 + +.type __inner_loop_48,\@abi-omnipotent +.align 32 +__inner_loop_48: + mov \$48, $cnt # 48 is 768%30+30 + +.Loop_48: + mov $a_, %rax + and $b_, %rax + shr \$1, %rax # (a_ & b_) >> 1 + + cmp $b_, $a_ # if |a_|<|b_|, swap the variables + mov $a_, $t0 + mov $b_, $t1 + lea (%rax,$L), %rax + mov $L, $t2 + cmovb $b_, $a_ + cmovb $t0, $b_ + cmovb %rax, $L + + sub $b_, $a_ # |a_|-|b_| + + test \$1, $t0 # if |a_| was even, roll back + cmovz $t0, $a_ + cmovz $t1, $b_ + cmovz $t2, $L + + lea 2($b_), %rax + shr \$1, $a_ # |a_|>>=1 + shr \$2, %rax + add %rax, $L # "negate" |L| if |b|%8 is 3 or 5 + + sub \$1, $cnt + jnz .Loop_48 + + ret +.size __inner_loop_48,.-__inner_loop_48 +___ +} + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/ctq_inverse_mod_384-x86_64.pl b/crypto/blst_src/asm/ctq_inverse_mod_384-x86_64.pl new file mode 100755 index 00000000000..2be39d8ba8b --- /dev/null +++ b/crypto/blst_src/asm/ctq_inverse_mod_384-x86_64.pl @@ -0,0 +1,886 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast Euclidean inversion as suggested in +# https://eprint.iacr.org/2020/972. Performance is >5x better than +# modulus-specific FLT addition chain... 
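+#
+# Same algorithm as the ARMv8 ct_inverse_mod_383 above: per the $python_ref
+# model below, the inner loop is run in batches of 62 iterations, 12 full
+# batches plus a 22-iteration tail (766 = 12*62 + 22).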
+# +# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_inverse_mod_383(inp, mod): + a, u = inp, 1 + b, v = mod, 0 + + k = 62 + w = 64 + mask = (1 << w) - 1 + + for i in range(0, 766 // k): + # __ab_approximation_62 + n = max(a.bit_length(), b.bit_length()) + if n < 128: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-w)) << w) + b_ = (b & mask) | ((b >> (n-w)) << w) + + # __inner_loop_62 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + + # __smulq_383_n_shift_by_62 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if a < 0: + a, f0, g0 = -a, -f0, -g0 + if b < 0: + b, f1, g1 = -b, -f1, -g1 + + # __smulq_767x63 + u, v = u*f0 + v*g0, u*f1 + v*g1 + + if 766 % k: + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, 766 % k): + if a & 1: + if a < b: + a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 + a, f0, g0 = a-b, f0-f1, g0-g1 + a, f1, g1 = a >> 1, f1 << 1, g1 << 1 + + v = u*f1 + v*g1 + + if v < 0: + v += mod << (768 - mod.bit_length()) # left aligned + + return v & (2**768 - 1) # to be reduced % mod +___ + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx"); +my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr); +my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13"); +my $cnt = "%edi"; + +$frame = 8*11+2*512; + +$code.=<<___; +.text + +.globl ct_inverse_mod_383 +.type ct_inverse_mod_383,\@function,4,"unwind" +.align 32 +ct_inverse_mod_383: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 8*11+511(%rsp), %rax # find closest 512-byte-aligned spot + and \$-512, %rax # in the frame... 
+ mov $out_ptr, 8*4(%rsp) + mov $nx_ptr, 8*5(%rsp) + + mov 8*0($in_ptr), @acc[0] # load input + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov 8*0($n_ptr), @acc[6] # load modulus + mov 8*1($n_ptr), @acc[7] + mov 8*2($n_ptr), @acc[8] + mov 8*3($n_ptr), @acc[9] + mov 8*4($n_ptr), @acc[10] + mov 8*5($n_ptr), @acc[11] + + mov @acc[0], 8*0(%rax) # copy input to |a| + mov @acc[1], 8*1(%rax) + mov @acc[2], 8*2(%rax) + mov @acc[3], 8*3(%rax) + mov @acc[4], 8*4(%rax) + mov @acc[5], 8*5(%rax) + + mov @acc[6], 8*6(%rax) # copy modulus to |b| + mov @acc[7], 8*7(%rax) + mov @acc[8], 8*8(%rax) + mov @acc[9], 8*9(%rax) + mov @acc[10], 8*10(%rax) + mov %rax, $in_ptr # pointer to source |a|b|1|0| + mov @acc[11], 8*11(%rax) + + ################################# first iteration + mov \$62, $cnt + call __ab_approximation_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_383_n_shift_by_62 + #mov $f0, 8*7(%rsp) # corrected |f0| + #mov $g0, 8*8(%rsp) # corrected |g0| + mov $f0, 8*12($out_ptr) # initialize |u| with |f0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_383_n_shift_by_62 + #mov $f0, 8*9(%rsp) # corrected |f1| + #mov $g0, 8*10(%rsp) # corrected |g1| + mov $f0, 8*12($out_ptr) # initialize |v| with |f1| + + ################################# second iteration + xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$62, $cnt + call __ab_approximation_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_383_n_shift_by_62 + mov $f0, 8*7(%rsp) # corrected |f0| + mov $g0, 8*8(%rsp) # corrected |g0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_383_n_shift_by_62 + #mov $f0, 8*9(%rsp) # corrected |f1| + #mov $g0, 8*10(%rsp) # corrected |g1| + + mov 8*12($in_ptr), %rax # |u| + mov 8*18($in_ptr), @acc[3] # |v| + mov $f0, %rbx + mov %rax, @acc[2] + imulq 8*7(%rsp) # |u|*|f0| + mov %rax, @acc[0] + mov @acc[3], %rax + mov %rdx, @acc[1] + imulq 8*8(%rsp) # |v|*|g0| + add %rax, @acc[0] + adc %rdx, @acc[1] + mov @acc[0], 8*6($out_ptr) # destination |u| + mov @acc[1], 8*7($out_ptr) + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*8($out_ptr) + mov @acc[1], 8*9($out_ptr) + mov @acc[1], 8*10($out_ptr) + mov @acc[1], 8*11($out_ptr) + lea 8*12($in_ptr),$in_ptr # make in_ptr "rewindable" with xor + + mov @acc[2], %rax + imulq %rbx # |u|*|f1| + mov %rax, @acc[0] + mov @acc[3], %rax + mov %rdx, @acc[1] + imulq %rcx # |v|*|g1| + add %rax, @acc[0] + adc %rdx, @acc[1] + mov @acc[0], 8*12($out_ptr) # destination |v| + mov @acc[1], 8*13($out_ptr) + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*14($out_ptr) + mov @acc[1], 8*15($out_ptr) + mov @acc[1], 8*16($out_ptr) + mov @acc[1], 8*17($out_ptr) +___ +for($i=2; $i<11; $i++) { +my $smul_767x63 = $i>5 ? 
"__smulq_767x63" + : "__smulq_383x63"; +$code.=<<___; + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$62, $cnt + call __ab_approximation_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_383_n_shift_by_62 + mov $f0, 8*7(%rsp) # corrected |f0| + mov $g0, 8*8(%rsp) # corrected |g0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_383_n_shift_by_62 + mov $f0, 8*9(%rsp) # corrected |f1| + mov $g0, 8*10(%rsp) # corrected |g1| + + mov 8*7(%rsp), $f0 # |f0| + mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + call __smulq_383x63 + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr),$out_ptr # pointer to destination |v| + call $smul_767x63 +___ +$code.=<<___ if ($i==5); + sar \$63, @acc[5] # sign extension + mov @acc[5], 8*6($out_ptr) + mov @acc[5], 8*7($out_ptr) + mov @acc[5], 8*8($out_ptr) + mov @acc[5], 8*9($out_ptr) + mov @acc[5], 8*10($out_ptr) + mov @acc[5], 8*11($out_ptr) +___ +} +$code.=<<___; + ################################# iteration before last + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$62, $cnt + #call __ab_approximation_62 # |a| and |b| are exact, just load + mov 8*0($in_ptr), @acc[0] # |a_lo| + mov 8*1($in_ptr), @acc[1] # |a_hi| + mov 8*6($in_ptr), @acc[2] # |b_lo| + mov 8*7($in_ptr), @acc[3] # |b_hi| + call __inner_loop_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + mov @acc[0], 8*0($out_ptr) + mov @acc[2], 8*6($out_ptr) + + #mov 8*7(%rsp), $f0 # |f0| + #mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + lea 8*12($out_ptr),$out_ptr # pointer to destination |u| + call __smulq_383x63 + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr),$out_ptr # pointer to destination |v| + call __smulq_767x63 + + ################################# last iteration + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$22, $cnt # 766 % 62 + #call __ab_approximation_62 # |a| and |b| are exact, just load + mov 8*0($in_ptr), @acc[0] # |a_lo| + xor @acc[1], @acc[1] # |a_hi| + mov 8*6($in_ptr), @acc[2] # |b_lo| + xor @acc[3], @acc[3] # |b_hi| + call __inner_loop_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + #mov $f1, 8*9(%rsp) + #mov $g1, 8*10(%rsp) + + #mov 8*7(%rsp), $f0 # |f0| + #mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + #lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + #call __smulq_383x63 + + #mov 8*9(%rsp), $f0 # |f1| + #mov 8*10(%rsp), $g0 # |g1| + mov $f1, $f0 + mov $g1, $g0 + mov 8*4(%rsp), $out_ptr # original out_ptr + call __smulq_767x63 + + mov 8*5(%rsp), $in_ptr # original n_ptr + mov %rax, %rdx # top limb of the result + sar \$63, %rax # result's sign as mask + + mov %rax, @acc[0] # mask |modulus| + mov %rax, @acc[1] + mov %rax, @acc[2] + and 8*0($in_ptr), @acc[0] + and 8*1($in_ptr), @acc[1] + mov %rax, @acc[3] + and 8*2($in_ptr), @acc[2] + and 8*3($in_ptr), @acc[3] + mov %rax, @acc[4] + and 8*4($in_ptr), @acc[4] + and 8*5($in_ptr), %rax + + add @acc[0], @acc[6] # conditionally add |modulus|<<384 + adc @acc[1], @acc[7] + adc @acc[2], @acc[8] + adc 
@acc[3], @acc[9] + adc @acc[4], %rcx + adc %rax, %rdx + + mov @acc[6], 8*6($out_ptr) # store absolute value + mov @acc[7], 8*7($out_ptr) + mov @acc[8], 8*8($out_ptr) + mov @acc[9], 8*9($out_ptr) + mov %rcx, 8*10($out_ptr) + mov %rdx, 8*11($out_ptr) + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size ct_inverse_mod_383,.-ct_inverse_mod_383 +___ +######################################################################## +# see corresponding commentary in ctx_inverse_mod_384-x86_64... +{ +my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx"); +my @acc = map("%r$_",(8..15),"bx","bp","cx","di"); +my $fx = @acc[9]; + +$code.=<<___; +.type __smulq_767x63,\@abi-omnipotent +.align 32 +__smulq_767x63: + mov 8*0($in_ptr), @acc[0] # load |u| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov $f0, $fx + sar \$63, $f0 # |f0|'s sign as mask + xor %rax, %rax + sub $f0, %rax # |f0|'s sign as bit + + mov $out_ptr, 8*1(%rsp) + mov $in_ptr, 8*2(%rsp) + lea 8*6($in_ptr), $in_ptr # pointer to |v| + + xor $f0, $fx # conditionally negate |f0| + add %rax, $fx + + xor $f0, @acc[0] # conditionally negate |u| + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, @acc[4] + xor $f0, @acc[5] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mulq $fx # |u|*|f0| + mov %rax, 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<5; $i++) { +$code.=<<___; + mulq $fx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] + mov @acc[$i], 8*$i($out_ptr) +___ +} +$code.=<<___; + imulq $fx + add %rax, @acc[$i] + adc \$0, %rdx + + mov @acc[5], 8*5($out_ptr) + mov %rdx, 8*6($out_ptr) + sar \$63, %rdx # sign extension + mov %rdx, 8*7($out_ptr) +___ +{ +my $fx=$in_ptr; +$code.=<<___; + mov $g0, $f0 # load |g0| + + mov 8*0($in_ptr), @acc[0] # load |v| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + mov 8*6($in_ptr), @acc[6] + mov 8*7($in_ptr), @acc[7] + mov 8*8($in_ptr), @acc[8] + mov 8*9($in_ptr), @acc[9] + mov 8*10($in_ptr), @acc[10] + mov 8*11($in_ptr), @acc[11] + + mov $f0, $fx # overrides in_ptr + sar \$63, $f0 # |g0|'s sign as mask + xor %rax, %rax + sub $f0, %rax # |g0|'s sign as bit + + xor $f0, $fx # conditionally negate |g0| + add %rax, $fx + + xor $f0, @acc[0] # conditionally negate |v| + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, @acc[4] + xor $f0, @acc[5] + xor $f0, @acc[6] + xor $f0, @acc[7] + xor $f0, @acc[8] + xor $f0, @acc[9] + xor $f0, @acc[10] + xor $f0, @acc[11] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + adc \$0, @acc[6] + adc \$0, @acc[7] + adc \$0, @acc[8] + adc \$0, @acc[9] + adc \$0, @acc[10] + adc \$0, @acc[11] + + mulq $fx # |v|*|g0| + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<11; $i++) { +$code.=<<___; + mulq $fx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} 
+$code.=<<___; + mov 8*1(%rsp), %rdx # out_ptr + imulq $fx, %rax + mov 8*2(%rsp), $in_ptr # restore original in_ptr + add @acc[11], %rax + + add 8*0(%rdx), @acc[0] # accumulate |u|*|f0| + adc 8*1(%rdx), @acc[1] + adc 8*2(%rdx), @acc[2] + adc 8*3(%rdx), @acc[3] + adc 8*4(%rdx), @acc[4] + adc 8*5(%rdx), @acc[5] + adc 8*6(%rdx), @acc[6] + mov 8*7(%rdx), @acc[11] # sign extension + adc @acc[11], @acc[7] + adc @acc[11], @acc[8] + adc @acc[11], @acc[9] + adc @acc[11], @acc[10] + adc @acc[11], %rax + + mov %rdx, $out_ptr # restore original out_ptr + + mov @acc[0], 8*0(%rdx) + mov @acc[1], 8*1(%rdx) + mov @acc[2], 8*2(%rdx) + mov @acc[3], 8*3(%rdx) + mov @acc[4], 8*4(%rdx) + mov @acc[5], 8*5(%rdx) + mov @acc[6], 8*6(%rdx) + mov @acc[7], 8*7(%rdx) + mov @acc[8], 8*8(%rdx) + mov @acc[9], 8*9(%rdx) + mov @acc[10], 8*10(%rdx) + mov %rax, 8*11(%rdx) + + ret +.size __smulq_767x63,.-__smulq_767x63 +___ +} +$code.=<<___; +.type __smulq_383x63,\@abi-omnipotent +.align 32 +__smulq_383x63: +___ +for($j=0; $j<2; $j++) { +$code.=<<___; + mov 8*0($in_ptr), @acc[0] # load |u| (or |v|) + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov %rdx, $fx + sar \$63, %rdx # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub %rdx, %rax # |f0|'s sign as bit (or |g0|'s) + + xor %rdx, $fx # conditionally negate |f0| + add %rax, $fx + + xor %rdx, @acc[0] # conditionally negate |u| (or |v|) + xor %rdx, @acc[1] + xor %rdx, @acc[2] + xor %rdx, @acc[3] + xor %rdx, @acc[4] + xor %rdx, @acc[5] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mulq $fx # |u|*|f0| (or |v|*|g0|) + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<5; $i++) { +$code.=<<___; + mulq $fx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___ if ($j==0); + imulq $fx, %rax + add %rax, @acc[$i] + + lea 8*6($in_ptr), $in_ptr # pointer to |v| + mov $g0, %rdx + + mov @acc[0], 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) +___ +} +$code.=<<___; + imulq $fx, %rax + add %rax, @acc[$i] + + lea -8*6($in_ptr), $in_ptr # restore original in_ptr + + add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), @acc[5] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) + + ret +.size __smulq_383x63,.-__smulq_383x63 +___ +{ +$code.=<<___; +.type __smulq_383_n_shift_by_62,\@abi-omnipotent +.align 32 +__smulq_383_n_shift_by_62: + mov $f0, @acc[8] +___ +my $f0 = @acc[8]; +for($j=0; $j<2; $j++) { +$code.=<<___; + mov 8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov %rdx, $fx + sar \$63, %rdx # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub %rdx, %rax # |f0|'s sign as bit (or |g0|'s) + + xor %rdx, $fx # conditionally negate |f0| (or |g0|) + add %rax, $fx + + xor %rdx, @acc[0] # conditionally negate |a| (or |b|) + xor %rdx, @acc[1] + xor %rdx, @acc[2] + xor %rdx, @acc[3] + xor %rdx, @acc[4] + xor %rdx, @acc[5] + add 
@acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mulq $fx # |a|*|f0| (or |b|*|g0|) + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<5; $i++) { +$code.=<<___; + mulq $fx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___ if ($j==0); + imulq $fx + add %rax, @acc[$i] + adc \$0, %rdx + + lea 8*6($in_ptr), $in_ptr # pointer to |b| + mov %rdx, @acc[6] + mov $g0, %rdx + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) +___ +} +$code.=<<___; + imulq $fx + add %rax, @acc[$i] + adc \$0, %rdx + + lea -8*6($in_ptr), $in_ptr # restore original in_ptr + + add 8*0($out_ptr), @acc[0] + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), @acc[5] + adc %rdx, @acc[6] + mov $f0, %rdx + + shrd \$62, @acc[1], @acc[0] + shrd \$62, @acc[2], @acc[1] + shrd \$62, @acc[3], @acc[2] + shrd \$62, @acc[4], @acc[3] + shrd \$62, @acc[5], @acc[4] + shrd \$62, @acc[6], @acc[5] + + sar \$63, @acc[6] # sign as mask + xor $fx, $fx + sub @acc[6], $fx # sign as bit + + xor @acc[6], @acc[0] # conditionally negate the result + xor @acc[6], @acc[1] + xor @acc[6], @acc[2] + xor @acc[6], @acc[3] + xor @acc[6], @acc[4] + xor @acc[6], @acc[5] + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) + + xor @acc[6], %rdx # conditionally negate |f0| + xor @acc[6], $g0 # conditionally negate |g0| + add $fx, %rdx + add $fx, $g0 + + ret +.size __smulq_383_n_shift_by_62,.-__smulq_383_n_shift_by_62 +___ +} } + +{ +my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11)); +my ($t0, $t1, $t2, $t3, $t4, $t5) = ("%rax","%rbx","%rbp","%r14","%r15","%rsi"); +{ +my @a = ($a_lo, $t1, $a_hi); +my @b = ($b_lo, $t2, $b_hi); + +$code.=<<___; +.type __ab_approximation_62,\@abi-omnipotent +.align 32 +__ab_approximation_62: + mov 8*5($in_ptr), @a[2] # load |a| in reverse order + mov 8*11($in_ptr), @b[2] # load |b| in reverse order + mov 8*4($in_ptr), @a[1] + mov 8*10($in_ptr), @b[1] + mov 8*3($in_ptr), @a[0] + mov 8*9($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # check top-most limbs, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + mov 8*2($in_ptr), @a[0] + mov 8*8($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... ones before top-most, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + mov 8*1($in_ptr), @a[0] + mov 8*7($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... and ones before that ... 
+ cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + mov 8*0($in_ptr), @a[0] + mov 8*6($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 + bsr $t0, %rcx + lea 1(%rcx), %rcx + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz $t0, %rcx + neg %rcx + #and \$63, %rcx # debugging artefact + + shldq %cl, @a[1], @a[2] # align second limb to the left + shldq %cl, @b[1], @b[2] + + jmp __inner_loop_62 + + ret +.size __ab_approximation_62,.-__ab_approximation_62 +___ +} +$code.=<<___; +.type __inner_loop_62,\@abi-omnipotent +.align 8 +.long 0 +__inner_loop_62: + mov \$1, $f0 # |f0|=1 + xor $g0, $g0 # |g0|=0 + xor $f1, $f1 # |f1|=0 + mov \$1, $g1 # |g1|=1 + mov $in_ptr, 8(%rsp) + +.Loop_62: + xor $t0, $t0 + xor $t1, $t1 + test \$1, $a_lo # if |a_| is odd, then we'll be subtracting |b_| + mov $b_lo, $t2 + mov $b_hi, $t3 + cmovnz $b_lo, $t0 + cmovnz $b_hi, $t1 + sub $a_lo, $t2 # |b_|-|a_| + sbb $a_hi, $t3 + mov $a_lo, $t4 + mov $a_hi, $t5 + sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even) + sbb $t1, $a_hi + cmovc $t2, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_| + cmovc $t3, $a_hi + cmovc $t4, $b_lo # |b_| = |a_| + cmovc $t5, $b_hi + mov $f0, $t0 # exchange |f0| and |f1| + cmovc $f1, $f0 + cmovc $t0, $f1 + mov $g0, $t1 # exchange |g0| and |g1| + cmovc $g1, $g0 + cmovc $t1, $g1 + xor $t0, $t0 + xor $t1, $t1 + shrd \$1, $a_hi, $a_lo + shr \$1, $a_hi + test \$1, $t4 # if |a_| was odd, then we'll be subtracting... + cmovnz $f1, $t0 + cmovnz $g1, $t1 + add $f1, $f1 # |f1|<<=1 + add $g1, $g1 # |g1|<<=1 + sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...) + sub \$1, $cnt + jnz .Loop_62 + + mov 8(%rsp), $in_ptr + ret +.size __inner_loop_62,.-__inner_loop_62 +___ +} + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/ctx_inverse_mod_384-x86_64.pl b/crypto/blst_src/asm/ctx_inverse_mod_384-x86_64.pl new file mode 100755 index 00000000000..d207e2f5a7c --- /dev/null +++ b/crypto/blst_src/asm/ctx_inverse_mod_384-x86_64.pl @@ -0,0 +1,995 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast Euclidean inversion as suggested in +# https://eprint.iacr.org/2020/972. Performance is >4x better than +# modulus-specific FLT addition chain... 
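+#
+# Unlike the preceding ct_inverse_mod_383 implementations, this variant runs
+# the inner loop in batches of 31 iterations, 24 full batches plus a
+# 22-iteration tail (766 = 24*31 + 22), as reflected by k = 31 in the
+# $python_ref model below.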
+# +# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_inverse_mod_383(inp, mod): + a, u = inp, 1 + b, v = mod, 0 + + k = 31 + mask = (1 << k) - 1 + + for i in range(0, 766 // k): + # __ab_approximation_31 + n = max(a.bit_length(), b.bit_length()) + if n < 64: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-k-2)) << k) + b_ = (b & mask) | ((b >> (n-k-2)) << k) + + # __inner_loop_31 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + + # __smulx_383_n_shift_by_31 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if a < 0: + a, f0, g0 = -a, -f0, -g0 + if b < 0: + b, f1, g1 = -b, -f1, -g1 + + # __smulx_767x63 + u, v = u*f0 + v*g0, u*f1 + v*g1 + + if 766 % k: + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, 766 % k): + if a & 1: + if a < b: + a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 + a, f0, g0 = a-b, f0-f1, g0-g1 + a, f1, g1 = a >> 1, f1 << 1, g1 << 1 + + v = u*f1 + v*g1 + + if v < 0: + v += mod << (768 - mod.bit_length()) # left aligned + + return v & (2**768 - 1) # to be reduced % mod +___ + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx"); +my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr); +my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13"); +my $cnt = "%edi"; + +$frame = 8*11+2*512; + +$code.=<<___; +.text + +.globl ctx_inverse_mod_383 +.type ctx_inverse_mod_383,\@function,4,"unwind" +.align 32 +ctx_inverse_mod_383: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 8*11+511(%rsp), %rax # find closest 512-byte-aligned spot + and \$-512, %rax # in the frame... 
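+	# (a worked example of the rounding above, with made-up numbers:
+	# 0x1001 + 511 = 0x1200, and 0x1200 & -512 = 0x1200, so %rax lands
+	# on the first 512-byte boundary at or above %rsp + 8*11, keeping
+	# the low 8*N(%rsp) scratch slots used later below it)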
+ mov $out_ptr, 8*4(%rsp) + mov $nx_ptr, 8*5(%rsp) + + mov 8*0($in_ptr), @acc[0] # load input + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov 8*0($n_ptr), @acc[6] # load modulus + mov 8*1($n_ptr), @acc[7] + mov 8*2($n_ptr), @acc[8] + mov 8*3($n_ptr), @acc[9] + mov 8*4($n_ptr), @acc[10] + mov 8*5($n_ptr), @acc[11] + + mov @acc[0], 8*0(%rax) # copy input to |a| + mov @acc[1], 8*1(%rax) + mov @acc[2], 8*2(%rax) + mov @acc[3], 8*3(%rax) + mov @acc[4], 8*4(%rax) + mov @acc[5], 8*5(%rax) + + mov @acc[6], 8*6(%rax) # copy modulus to |b| + mov @acc[7], 8*7(%rax) + mov @acc[8], 8*8(%rax) + mov @acc[9], 8*9(%rax) + mov @acc[10], 8*10(%rax) + mov %rax, $in_ptr + mov @acc[11], 8*11(%rax) + + ################################# first iteration + mov \$31, $cnt + call __ab_approximation_31 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulx_383_n_shift_by_31 + #mov $f0, 8*7(%rsp) # corrected |f0| + #mov $g0, 8*8(%rsp) # corrected |g0| + mov $f0, 8*12($out_ptr) # initialize |u| with |f0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulx_383_n_shift_by_31 + #mov $f0, 8*9(%rsp) # corrected |f1| + #mov $g0, 8*10(%rsp) # corrected |g1| + mov $f0, 8*12($out_ptr) # initialize |v| with |f1| + + ################################# second iteration + xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$31, $cnt + call __ab_approximation_31 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulx_383_n_shift_by_31 + mov $f0, 8*7(%rsp) # corrected |f0| + mov $g0, 8*8(%rsp) # corrected |g0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulx_383_n_shift_by_31 + #mov $f0, 8*9(%rsp) # corrected |f1| + #mov $g0, 8*10(%rsp) # corrected |g1| + + mov 8*12($in_ptr), %rax # |u| + mov 8*18($in_ptr), @acc[3] # |v| + mov $f0, %rbx + mov %rax, @acc[2] + imulq 8*7(%rsp) # |u|*|f0| + mov %rax, @acc[0] + mov @acc[3], %rax + mov %rdx, @acc[1] + imulq 8*8(%rsp) # |v|*|g0| + add %rax, @acc[0] + adc %rdx, @acc[1] + mov @acc[0], 8*6($out_ptr) # destination |u| + mov @acc[1], 8*7($out_ptr) + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*8($out_ptr) + mov @acc[1], 8*9($out_ptr) + mov @acc[1], 8*10($out_ptr) + mov @acc[1], 8*11($out_ptr) + lea 8*12($in_ptr), $in_ptr # make in_ptr "rewindable" with xor + + mov @acc[2], %rax + imulq %rbx # |u|*|f1| + mov %rax, @acc[0] + mov @acc[3], %rax + mov %rdx, @acc[1] + imulq %rcx # |v|*|g1| + add %rax, @acc[0] + adc %rdx, @acc[1] + mov @acc[0], 8*12($out_ptr) # destination |v| + mov @acc[1], 8*13($out_ptr) + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*14($out_ptr) + mov @acc[1], 8*15($out_ptr) + mov @acc[1], 8*16($out_ptr) + mov @acc[1], 8*17($out_ptr) +___ +for($i=2; $i<23; $i++) { +my $smul_n_shift = $i<19 ? "__smulx_383_n_shift_by_31" + : "__smulx_191_n_shift_by_31"; +my $smul_767x63 = $i>11 ? 
"__smulx_767x63" + : "__smulx_383x63"; +$code.=<<___; + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$31, $cnt + call __ab_approximation_31 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call $smul_n_shift + mov $f0, 8*7(%rsp) # corrected |f0| + mov $g0, 8*8(%rsp) # corrected |g0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call $smul_n_shift + mov $f0, 8*9(%rsp) # corrected |f1| + mov $g0, 8*10(%rsp) # corrected |g1| + + mov 8*7(%rsp), $f0 # |f0| + mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + call __smulx_383x63 + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr),$out_ptr # pointer to destination |v| + call $smul_767x63 +___ +$code.=<<___ if ($i==11); + sar \$63, @acc[5] # sign extension + mov @acc[5], 8*6($out_ptr) + mov @acc[5], 8*7($out_ptr) + mov @acc[5], 8*8($out_ptr) + mov @acc[5], 8*9($out_ptr) + mov @acc[5], 8*10($out_ptr) + mov @acc[5], 8*11($out_ptr) +___ +} +$code.=<<___; + ################################# two[!] last iterations in one go + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$53, $cnt # 31 + 766 % 31 + #call __ab_approximation_31 # |a| and |b| are exact, just load + mov 8*0($in_ptr), @acc[0] # |a_lo| + #xor @acc[1], @acc[1] # |a_hi| + mov 8*6($in_ptr), @acc[2] # |b_lo| + #xor @acc[3], @acc[3] # |b_hi| + call __inner_loop_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + #mov $f1, 8*9(%rsp) + #mov $g1, 8*10(%rsp) + + #mov 8*7(%rsp), $f0 # |f0| + #mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + #lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + #call __smulx_383x63 + + #mov 8*9(%rsp), $f0 # |f1| + #mov 8*10(%rsp), $g0 # |g1| + mov $f1, $f0 + mov $g1, $g0 + mov 8*4(%rsp), $out_ptr # original out_ptr + call __smulx_767x63 + + mov 8*5(%rsp), $in_ptr # original n_ptr + mov %rax, %rdx # top limb of the result + sar \$63, %rax # result's sign as mask + + mov %rax, @acc[0] # mask |modulus| + mov %rax, @acc[1] + mov %rax, @acc[2] + and 8*0($in_ptr), @acc[0] + and 8*1($in_ptr), @acc[1] + mov %rax, @acc[3] + and 8*2($in_ptr), @acc[2] + and 8*3($in_ptr), @acc[3] + mov %rax, @acc[4] + and 8*4($in_ptr), @acc[4] + and 8*5($in_ptr), %rax + + add @acc[0], @acc[6] # conditionally add |modulus|<<384 + adc @acc[1], @acc[7] + adc @acc[2], @acc[8] + adc @acc[3], @acc[9] + adc @acc[4], %rcx + adc %rax, %rdx + + mov @acc[6], 8*6($out_ptr) # store absolute value + mov @acc[7], 8*7($out_ptr) + mov @acc[8], 8*8($out_ptr) + mov @acc[9], 8*9($out_ptr) + mov %rcx, 8*10($out_ptr) + mov %rdx, 8*11($out_ptr) + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size ctx_inverse_mod_383,.-ctx_inverse_mod_383 +___ +######################################################################## +# Signed |u|*|f?|+|v|*|g?| subroutines. 
"NNN" in "NNNx63" suffix refers +# to the maximum bit-length of the *result*, and "63" - to the maximum +# bit-length of the |f?| and |g?| single-limb multiplicands. However! +# The latter should not be taken literally, as they are always chosen so +# that "bad things" don't happen. For example, there comes a point when +# |v| grows beyond 383 bits, while |u| remains 383 bits wide. Yet, we +# always call __smul_383x63 to perform |u|*|f0|+|v|*|g0| step. This is +# because past that point |f0| is always 1 and |g0| is always 0. And, +# since |u| never grows beyond 383 bits, __smul_767x63 doesn't have to +# perform full-width |u|*|f1| multiplication, half-width one with sign +# extension is sufficient... +{ +my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx"); +my @acc = map("%r$_",(8..15),"bx","bp","cx","di"); +my $fx = @acc[9]; + +$code.=<<___; +.type __smulx_767x63,\@abi-omnipotent +.align 32 +__smulx_767x63: + mov 8*0($in_ptr), @acc[0] # load |u| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov $f0, %rax + sar \$63, %rax # |f0|'s sign as mask + xor $fx, $fx # overrides in_ptr + sub %rax, $fx # |f0|'s sign as bit + + mov $out_ptr, 8*1(%rsp) + mov $in_ptr, 8*2(%rsp) + lea 8*6($in_ptr), $in_ptr # pointer to |v| + + xor %rax, $f0 # conditionally negate |f0| + add $fx, $f0 + + xor %rax, @acc[0] # conditionally negate |u| + xor %rax, @acc[1] + xor %rax, @acc[2] + xor %rax, @acc[3] + xor %rax, @acc[4] + xor @acc[5], %rax + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, %rax + + mulx @acc[0], @acc[0], $fx # |u|*|f0| + mulx @acc[1], @acc[1], @acc[5] + add $fx, @acc[1] +___ +for(my ($a,$b) = ($fx, @acc[5]), $i=2; $i<5; $i++) { +$code.=<<___; + mulx @acc[$i], @acc[$i], $a + adc $b, @acc[$i] +___ + ($a, $b) = ($b, $a); +} +$code.=<<___; + adc \$0, $fx + imulq %rdx + add $fx, %rax + adc \$0, %rdx + + mov @acc[0], 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov %rax, 8*5($out_ptr) + mov %rdx, 8*6($out_ptr) + sar \$63, %rdx # sign extension + mov %rdx, 8*7($out_ptr) +___ +{ +my $fx=$in_ptr; +$code.=<<___; + mov $g0, $f0 # load |g0| + mov $g0, %rax + + mov 8*0($in_ptr), @acc[0] # load |v| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + mov 8*6($in_ptr), @acc[6] + mov 8*7($in_ptr), @acc[7] + mov 8*8($in_ptr), @acc[8] + mov 8*9($in_ptr), @acc[9] + mov 8*10($in_ptr), @acc[10] + mov 8*11($in_ptr), @acc[11] + + sar \$63, %rax # |g0|'s sign as mask + xor $fx, $fx # overrides in_ptr + sub %rax, $fx # |g0|'s sign as bit + + xor %rax, $f0 # conditionally negate |g0| + add $fx, $f0 + + xor %rax, @acc[0] # conditionally negate |v| + xor %rax, @acc[1] + xor %rax, @acc[2] + xor %rax, @acc[3] + xor %rax, @acc[4] + xor %rax, @acc[5] + xor %rax, @acc[6] + xor %rax, @acc[7] + xor %rax, @acc[8] + xor %rax, @acc[9] + xor %rax, @acc[10] + xor %rax, @acc[11] + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + adc \$0, @acc[6] + adc \$0, @acc[7] + adc \$0, @acc[8] + adc \$0, @acc[9] + adc \$0, @acc[10] + adc \$0, @acc[11] + + mulx @acc[0], @acc[0], %rax # |v|*|g0| + mulx @acc[1], @acc[1], $fx + add %rax, @acc[1] +___ +for(my ($a,$b) = ("%rax", $fx), $i=2; $i<11; $i++) { +$code.=<<___; + mulx 
@acc[$i], @acc[$i], $a + adc $b, @acc[$i] +___ + ($a, $b) = ($b, $a); +} +$code.=<<___; + mulx @acc[11], @acc[11], $fx + mov 8*1(%rsp), %rdx # out_ptr + mov 8*2(%rsp), $in_ptr # restore original in_ptr + adc @acc[11], %rax + + add 8*0(%rdx), @acc[0] # accumulate |u|*|f0| + adc 8*1(%rdx), @acc[1] + adc 8*2(%rdx), @acc[2] + adc 8*3(%rdx), @acc[3] + adc 8*4(%rdx), @acc[4] + adc 8*5(%rdx), @acc[5] + adc 8*6(%rdx), @acc[6] + mov 8*7(%rdx), @acc[11] # sign extension + adc @acc[11], @acc[7] + adc @acc[11], @acc[8] + adc @acc[11], @acc[9] + adc @acc[11], @acc[10] + adc @acc[11], %rax + + mov %rdx, $out_ptr # restore original out_ptr + + mov @acc[0], 8*0(%rdx) + mov @acc[1], 8*1(%rdx) + mov @acc[2], 8*2(%rdx) + mov @acc[3], 8*3(%rdx) + mov @acc[4], 8*4(%rdx) + mov @acc[5], 8*5(%rdx) + mov @acc[6], 8*6(%rdx) + mov @acc[7], 8*7(%rdx) + mov @acc[8], 8*8(%rdx) + mov @acc[9], 8*9(%rdx) + mov @acc[10], 8*10(%rdx) + mov %rax, 8*11(%rdx) + + ret +.size __smulx_767x63,.-__smulx_767x63 +___ +} +$code.=<<___; +.type __smulx_383x63,\@abi-omnipotent +.align 32 +__smulx_383x63: +___ +for($j=0; $j<2; $j++) { +my $k = 8*6*$j; +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |u| (or |v|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + mov $k+8*3($in_ptr), @acc[3] + mov $k+8*4($in_ptr), @acc[4] + mov $k+8*5($in_ptr), @acc[5] + + mov $f0, $fx + sar \$63, $fx # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub $fx, %rax # |f0|'s sign as bit (or |g0|'s) + + xor $fx, $f0 # conditionally negate |f0| + add %rax, $f0 + + xor $fx, @acc[0] # conditionally negate |u| (or |v|) + xor $fx, @acc[1] + xor $fx, @acc[2] + xor $fx, @acc[3] + xor $fx, @acc[4] + xor $fx, @acc[5] + add %rax, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mulx @acc[0], @acc[0], $fx # |u|*|f0| (or |v|*|g0|) + mulx @acc[1], @acc[1], %rax + add $fx, @acc[1] +___ +for(my ($a,$b) = ($fx, "%rax"), $i=2; $i<5; $i++) { +$code.=<<___; + mulx @acc[$i], @acc[$i], $a + adc $b, @acc[$i] +___ + ($a, $b) = ($b, $a); +} +$code.=<<___ if ($j==0); + mulx @acc[$i], @acc[$i], %rax + mov $g0, $f0 + adc $fx, @acc[$i] + + mov @acc[0], 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) +___ +} +$code.=<<___; + mulx @acc[$i], @acc[$i], %rax + adc $fx, @acc[$i] + + add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), @acc[5] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) + + ret +.size __smulx_383x63,.-__smulx_383x63 +___ +######################################################################## +# Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of +# the names refers to maximum bit-lengths of |a| and |b|. As already +# mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always +# chosen so that "bad things" don't happen. For example, so that the +# sum of the products doesn't overflow, and that the final result is +# never wider than inputs... 
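+#
+# A rough Python-style model of what these helpers return (the names are
+# illustrative and simply mirror the python_ref at the top of this file;
+# they are not part of the generated code):
+#
+#   def smul_n_shift(a, b, f0, g0, k):
+#       t = a*f0 + b*g0                  # signed
+#       t >>= k                          # low k bits are shifted out
+#       if t < 0:                        # keep the value non-negative and
+#           t, f0, g0 = -t, -f0, -g0     # fold the sign back into f0/g0
+#       return t, f0, g0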
+{ +$code.=<<___; +.type __smulx_383_n_shift_by_31,\@abi-omnipotent +.align 32 +__smulx_383_n_shift_by_31: + mov $f0, @acc[8] + xor @acc[6], @acc[6] +___ +my $f0 = @acc[8]; +for($j=0; $j<2; $j++) { +my $k = 8*6*$j; +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + mov $k+8*3($in_ptr), @acc[3] + mov $k+8*4($in_ptr), @acc[4] + mov $k+8*5($in_ptr), @acc[5] + + mov %rdx, %rax + sar \$63, %rax # |f0|'s sign as mask (or |g0|'s) + xor $fx, $fx + sub %rax, $fx # |f0|'s sign as bit (or |g0|'s) + + xor %rax, %rdx # conditionally negate |f0| (or |g0|) + add $fx, %rdx + + xor %rax, @acc[0] # conditionally negate |a| (or |b|) + xor %rax, @acc[1] + xor %rax, @acc[2] + xor %rax, @acc[3] + xor %rax, @acc[4] + xor @acc[5], %rax + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, %rax + + mulx @acc[0], @acc[0], $fx # |a|*|f0| (or |b|*|g0|) + mulx @acc[1], @acc[1], @acc[5] + add $fx, @acc[1] +___ +for(my ($a,$b) = ($fx, @acc[5]), $i=2; $i<5; $i++) { +$code.=<<___; + mulx @acc[$i], @acc[$i], $a + adc $b, @acc[$i] +___ + ($a, $b) = ($b, $a); +} +$code.=<<___ if ($j==0); + adc \$0, $fx + imulq %rdx + add $fx, %rax + adc %rdx, @acc[6] + + mov $g0, %rdx + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov %rax, 8*5($out_ptr) +___ +} +$code.=<<___; + adc \$0, $fx + imulq %rdx + add $fx, %rax + adc \$0, %rdx + + add 8*0($out_ptr), @acc[0] + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), %rax + adc %rdx, @acc[6] + mov $f0, %rdx + + shrd \$31, @acc[1], @acc[0] + shrd \$31, @acc[2], @acc[1] + shrd \$31, @acc[3], @acc[2] + shrd \$31, @acc[4], @acc[3] + shrd \$31, %rax, @acc[4] + shrd \$31, @acc[6], %rax + + sar \$63, @acc[6] # sign as mask + xor $fx, $fx + sub @acc[6], $fx # sign as bit + + xor @acc[6], @acc[0] # conditionally negate the result + xor @acc[6], @acc[1] + xor @acc[6], @acc[2] + xor @acc[6], @acc[3] + xor @acc[6], @acc[4] + xor @acc[6], %rax + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, %rax + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov %rax, 8*5($out_ptr) + + xor @acc[6], %rdx # conditionally negate |f0| + xor @acc[6], $g0 # conditionally negate |g0| + add $fx, %rdx + add $fx, $g0 + + ret +.size __smulx_383_n_shift_by_31,.-__smulx_383_n_shift_by_31 +___ +} { +$code.=<<___; +.type __smulx_191_n_shift_by_31,\@abi-omnipotent +.align 32 +__smulx_191_n_shift_by_31: + mov $f0, @acc[8] +___ +my $f0 = @acc[8]; +for($j=0; $j<2; $j++) { +my $k = 8*6*$j; +my @acc=@acc; + @acc=@acc[3..5] if ($j); +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + + mov %rdx, %rax + sar \$63, %rax # |f0|'s sign as mask (or |g0|'s) + xor $fx, $fx + sub %rax, $fx # |f0|'s sign as bit (or |g0|'s) + + xor %rax, %rdx # conditionally negate |f0| (or |g0|) + add $fx, %rdx + + xor %rax, @acc[0] # conditionally negate |a| (or |b|) + xor %rax, @acc[1] + xor @acc[2], %rax + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, %rax + + mulx @acc[0], @acc[0], $fx # |a|*|f0| (or |b|*|g0|) + mulx @acc[1], @acc[1], @acc[2] + add $fx, @acc[1] + adc \$0, @acc[2] + imulq %rdx + add %rax, @acc[2] + adc 
\$0, %rdx +___ +$code.=<<___ if ($j==0); + mov %rdx, @acc[6] + mov $g0, %rdx +___ +} +$code.=<<___; + add @acc[0], @acc[3] + adc @acc[1], @acc[4] + adc @acc[2], @acc[5] + adc %rdx, @acc[6] + mov $f0, %rdx + + shrd \$31, @acc[4], @acc[3] + shrd \$31, @acc[5], @acc[4] + shrd \$31, @acc[6], @acc[5] + + sar \$63, @acc[6] # sign as mask + xor $fx, $fx + sub @acc[6], $fx # sign as bit + + xor @acc[6], @acc[3] # conditionally negate the result + xor @acc[6], @acc[4] + xor @acc[6], @acc[5] + add $fx, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mov @acc[3], 8*0($out_ptr) + mov @acc[4], 8*1($out_ptr) + mov @acc[5], 8*2($out_ptr) + + xor @acc[6], %rdx # conditionally negate |f0| + xor @acc[6], $g0 # conditionally negate |g0| + add $fx, %rdx + add $fx, $g0 + + ret +.size __smulx_191_n_shift_by_31,.-__smulx_191_n_shift_by_31 +___ +} } + +{ +my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11)); +my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15"); +my ($fg0, $fg1, $bias) = ($g0, $g1, $t4); +my ($a_, $b_) = ($a_lo, $b_lo); +{ +my @a = ($a_lo, $t1, $a_hi); +my @b = ($b_lo, $t2, $b_hi); + +$code.=<<___; +.type __ab_approximation_31,\@abi-omnipotent +.align 32 +__ab_approximation_31: + mov 8*5($in_ptr), @a[2] # load |a| in reverse order + mov 8*11($in_ptr), @b[2] # load |b| in reverse order + mov 8*4($in_ptr), @a[1] + mov 8*10($in_ptr), @b[1] + mov 8*3($in_ptr), @a[0] + mov 8*9($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # check top-most limbs, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + mov 8*2($in_ptr), @a[0] + cmovz @b[0], @b[1] + mov 8*8($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... ones before top-most, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + mov 8*1($in_ptr), @a[0] + cmovz @b[0], @b[1] + mov 8*7($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... and ones before that ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + mov 8*0($in_ptr), @a[0] + cmovz @b[0], @b[1] + mov 8*6($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... and ones before that ... 
+ cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + + mov @a[2], $t0 + or @b[2], $t0 + bsr $t0, %rcx + lea 1(%rcx), %rcx + cmovz @a[0], @a[2] + cmovz @b[0], @b[2] + cmovz $t0, %rcx + neg %rcx + #and \$63, %rcx # debugging artefact + + shldq %cl, @a[1], @a[2] # align second limb to the left + shldq %cl, @b[1], @b[2] + + mov \$0x7FFFFFFF, %eax + and %rax, @a[0] + and %rax, @b[0] + andn @a[2], %rax, @a[2] + andn @b[2], %rax, @b[2] + or @a[2], @a[0] + or @b[2], @b[0] + + jmp __inner_loop_31 + + ret +.size __ab_approximation_31,.-__ab_approximation_31 +___ +} +$code.=<<___; +.type __inner_loop_31,\@abi-omnipotent +.align 32 +__inner_loop_31: ################# by Thomas Pornin + mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0 + mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1 + mov \$0x7FFFFFFF7FFFFFFF, $bias + +.Loop_31: + cmp $b_, $a_ # if |a_|<|b_|, swap the variables + mov $a_, $t0 + mov $b_, $t1 + mov $fg0, $t2 + mov $fg1, $t3 + cmovb $b_, $a_ + cmovb $t0, $b_ + cmovb $fg1, $fg0 + cmovb $t2, $fg1 + + sub $b_, $a_ # |a_|-|b_| + sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1| + add $bias, $fg0 + + test \$1, $t0 # if |a_| was even, roll back + cmovz $t0, $a_ + cmovz $t1, $b_ + cmovz $t2, $fg0 + cmovz $t3, $fg1 + + shr \$1, $a_ # |a_|>>=1 + add $fg1, $fg1 # |f1|<<=1, |g1|<<=1 + sub $bias, $fg1 + sub \$1, $cnt + jnz .Loop_31 + + shr \$32, $bias + mov %ecx, %edx # $fg0, $f0 + mov ${fg1}d, ${f1}d + shr \$32, $g0 + shr \$32, $g1 + sub $bias, $f0 # remove the bias + sub $bias, $g0 + sub $bias, $f1 + sub $bias, $g1 + + ret +.size __inner_loop_31,.-__inner_loop_31 + +.type __inner_loop_62,\@abi-omnipotent +.align 32 +__inner_loop_62: + mov \$1, $f0 # |f0|=1 + xor $g0, $g0 # |g0|=0 + xor $f1, $f1 # |f1|=0 + mov \$1, $g1 # |g1|=1 + +.Loop_62: + xor $t0, $t0 + test \$1, $a_lo # if |a_| is odd, then we'll be subtracting |b_| + mov $b_lo, $t1 + cmovnz $b_lo, $t0 + sub $a_lo, $t1 # |b_|-|a_| + mov $a_lo, $t2 + sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even) + cmovc $t1, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_| + cmovc $t2, $b_lo # |b_| = |a_| + mov $f0, $t0 # exchange |f0| and |f1| + cmovc $f1, $f0 + cmovc $t0, $f1 + mov $g0, $t1 # exchange |g0| and |g1| + cmovc $g1, $g0 + cmovc $t1, $g1 + xor $t0, $t0 + xor $t1, $t1 + shr \$1, $a_lo + test \$1, $t2 # if |a_| was odd, then we'll be subtracting... + cmovnz $f1, $t0 + cmovnz $g1, $t1 + add $f1, $f1 # |f1|<<=1 + add $g1, $g1 # |g1|<<=1 + sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...) + sub \$1, $cnt + jnz .Loop_62 + + ret +.size __inner_loop_62,.-__inner_loop_62 +___ +} + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/div3w-armv8.pl b/crypto/blst_src/asm/div3w-armv8.pl new file mode 100755 index 00000000000..bfa32453c3a --- /dev/null +++ b/crypto/blst_src/asm/div3w-armv8.pl @@ -0,0 +1,122 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +$code.=<<___; +.text + +.globl div_3_limbs +.type div_3_limbs,%function +.align 5 +div_3_limbs: + ldp x4,x5,[x0] // load R + eor x0,x0,x0 // Q = 0 + mov x3,#64 // loop counter + nop + +.Loop: + subs x6,x4,x1 // R - D + add x0,x0,x0 // Q <<= 1 + sbcs x7,x5,x2 + add x0,x0,#1 // Q + speculative bit + csel x4,x4,x6,lo // select between R and R - D + extr x1,x2,x1,#1 // D >>= 1 + csel x5,x5,x7,lo + lsr x2,x2,#1 + sbc x0,x0,xzr // subtract speculative bit + sub x3,x3,#1 + cbnz x3,.Loop + + asr x3,x0,#63 // top bit -> mask + add x0,x0,x0 // Q <<= 1 + subs x6,x4,x1 // R - D + add x0,x0,#1 // Q + specilative bit + sbcs x7,x5,x2 + sbc x0,x0,xzr // subtract speculative bit + + orr x0,x0,x3 // all ones if overflow + + ret +.size div_3_limbs,.-div_3_limbs +___ +{ +my ($div_rem, $divisor, $quot) = map("x$_",(0..2)); +my @div = map("x$_",(3..4)); +my @acc = map("x$_",(5..7)); +my @t = map("x$_",(8..11)); + +$code.=<<___; +.globl quot_rem_128 +.type quot_rem_128,%function +.align 5 +quot_rem_128: + ldp @div[0],@div[1],[$divisor] + + mul @acc[0],@div[0],$quot // divisor[0:1} * quotient + umulh @acc[1],@div[0],$quot + mul @t[3], @div[1],$quot + umulh @acc[2],@div[1],$quot + + ldp @t[0],@t[1],[$div_rem] // load 3 limbs of the dividend + ldr @t[2],[$div_rem,#16] + + adds @acc[1],@acc[1],@t[3] + adc @acc[2],@acc[2],xzr + + subs @t[0],@t[0],@acc[0] // dividend - divisor * quotient + sbcs @t[1],@t[1],@acc[1] + sbcs @t[2],@t[2],@acc[2] + sbc @acc[0],xzr,xzr // borrow -> mask + + add $quot,$quot,@acc[0] // if borrowed, adjust the quotient ... + and @div[0],@div[0],@acc[0] + and @div[1],@div[1],@acc[0] + adds @t[0],@t[0],@div[0] // ... and add divisor + adc @t[1],@t[1],@div[1] + + stp @t[0],@t[1],[$div_rem] // save 2 limbs of the remainder + str $quot,[$div_rem,#16] // and one limb of the quotient + + mov x0,$quot // return adjusted quotient + + ret +.size quot_rem_128,.-quot_rem_128 + +.globl quot_rem_64 +.type quot_rem_64,%function +.align 5 +quot_rem_64: + ldr @div[0],[$divisor] + ldr @t[0],[$div_rem] // load 1 limb of the dividend + + mul @acc[0],@div[0],$quot // divisor * quotient + + sub @t[0],@t[0],@acc[0] // dividend - divisor * quotient + + stp @t[0],$quot,[$div_rem] // save remainder and quotient + + mov x0,$quot // return quotient + + ret +.size quot_rem_64,.-quot_rem_64 +___ +} + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/div3w-x86_64.pl b/crypto/blst_src/asm/div3w-x86_64.pl new file mode 100755 index 00000000000..b8192db8e6d --- /dev/null +++ b/crypto/blst_src/asm/div3w-x86_64.pl @@ -0,0 +1,184 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +$c_ref=<<'___'; +/* + * |div_top| points at two most significant limbs of the dividend, |d_hi| + * and |d_lo| are two most significant limbs of the divisor. If divisor + * is only one limb, it is to be passed in |d_hi| with zero in |d_lo|. + * The divisor is required to be "bitwise left-aligned," and dividend's + * top limbs to be not larger than the divisor's. The latter limitation + * can be problematic in the first iteration of multi-precision division, + * where in most general case the condition would have to be "smaller." + * The subroutine considers four limbs, two of which are "overlapping," + * hence the name... Another way to look at it is to think of the pair + * of the dividend's limbs being suffixed with a zero: + * +-------+-------+-------+ + * R | | | 0 | + * +-------+-------+-------+ + * +-------+-------+ + * D | | | + * +-------+-------+ + */ +limb_t div_3_limbs(const limb_t *div_top, limb_t d_lo, limb_t d_hi) +{ + llimb_t R = ((llimb_t)div_top[1] << LIMB_BITS) | div_top[0]; + llimb_t D = ((llimb_t)d_hi << LIMB_BITS) | d_lo; + limb_t Q = 0, mask; + size_t i; + + for (i = 0; i < LIMB_BITS; i++) { + Q <<= 1; + mask = (R >= D); + Q |= mask; + R -= (D & ((llimb_t)0 - mask)); + D >>= 1; + } + + mask = 0 - (Q >> (LIMB_BITS - 1)); /* does it overflow? */ + + Q <<= 1; + Q |= (R >= D); + + return (Q | mask); +} +___ + +$code.=<<___; +.text + +.globl div_3_limbs +.hidden div_3_limbs +.type div_3_limbs,\@function,3 +.align 32 +div_3_limbs: + mov (%rdi),%r8 # load R.lo + mov 8(%rdi),%r9 # load R.hi + xor %rax,%rax # Q = 0 + mov \$64,%ecx # loop counter + +.Loop: + mov %r8,%r10 # put aside R + sub %rsi,%r8 # R -= D + mov %r9,%r11 + sbb %rdx,%r9 + lea 1(%rax,%rax),%rax # Q <<= 1 + speculative bit + mov %rdx,%rdi + cmovc %r10,%r8 # restore R if R - D borrowed + cmovc %r11,%r9 + sbb \$0,%rax # subtract speculative bit + shl \$63,%rdi + shr \$1,%rsi + shr \$1,%rdx + or %rdi,%rsi # D >>= 1 + sub \$1,%ecx + jnz .Loop + + lea 1(%rax,%rax),%rcx # Q <<= 1 + speculative bit + sar \$63,%rax # top bit -> mask + + sub %rsi,%r8 # R -= D + sbb %rdx,%r9 + sbb \$0,%rcx # subtract speculative bit + + or %rcx,%rax # all ones if overflow + + ret +.size div_3_limbs,.-div_3_limbs +___ +######################################################################## +# Calculate remainder and adjust the quotient, which can be off-by-one. +# Then save quotient in limb next to top limb of the remainder. There is +# place, because the remainder/next-iteration-dividend gets shorter by +# one limb. 
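+#
+# A minimal sketch of that adjustment (illustrative Python, assuming the
+# estimate produced by div_3_limbs can overshoot by at most one):
+#
+#   def quot_rem(dividend, divisor, q):
+#       r = dividend - divisor*q
+#       if r < 0:              # estimate was one too large
+#           q -= 1
+#           r += divisor
+#       return q, r            # r overwrites the dividend limbs, q goes
+#                              # into the limb that just became free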
+{ +my ($div_rem, $divisor, $quotient) = ("%rdi", "%rsi", "%rcx"); +my @acc = ("%r8", "%r9", "%rdx"); +my @tmp = ("%r10", "%r11", "%rax"); + +$code.=<<___; +.globl quot_rem_128 +.hidden quot_rem_128 +.type quot_rem_128,\@function,3 +.align 32 +quot_rem_128: + mov %rdx, %rax + mov %rdx, $quotient + + mulq 0($divisor) # divisor[0:1] * quotient + mov %rax, @acc[0] + mov $quotient, %rax + mov %rdx, @acc[1] + + mulq 8($divisor) + add %rax, @acc[1] + adc \$0, %rdx # %rdx is @acc[2] + + mov 0($div_rem), @tmp[0] # load 3 limbs of the dividend + mov 8($div_rem), @tmp[1] + mov 16($div_rem), @tmp[2] + + sub @acc[0], @tmp[0] # dividend - divisor * quotient + sbb @acc[1], @tmp[1] + sbb @acc[2], @tmp[2] + sbb @acc[0], @acc[0] # borrow -> mask + + add @acc[0], $quotient # if borrowed, adjust the quotient ... + mov @acc[0], @acc[1] + and 0($divisor), @acc[0] + and 8($divisor), @acc[1] + add @acc[0], @tmp[0] # ... and add divisor + adc @acc[1], @tmp[1] + + mov @tmp[0], 0($div_rem) # save 2 limbs of the remainder ... + mov @tmp[1], 8($div_rem) + mov $quotient, 16($div_rem) # ... and 1 limb of the quotient + + mov $quotient, %rax # return adjusted quotient + + ret +.size quot_rem_128,.-quot_rem_128 + +######################################################################## +# Unlike 128-bit case above, quotient is exact. As result just one limb +# of the dividend is sufficient to calculate the remainder... + +.globl quot_rem_64 +.hidden quot_rem_64 +.type quot_rem_64,\@function,3 +.align 32 +quot_rem_64: + mov %rdx, %rax # return quotient + imulq 0($divisor), %rdx # divisor[0] * quotient + + mov 0($div_rem), @tmp[0] # load 1 limb of the dividend + + sub %rdx, @tmp[0] # dividend - divisor * quotient + + mov @tmp[0], 0($div_rem) # save 1 limb of the remainder ... + mov %rax, 8($div_rem) # ... and 1 limb of the quotient + + ret +.size quot_rem_64,.-quot_rem_64 +___ +} + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/mul_mont_256-armv8.pl b/crypto/blst_src/asm/mul_mont_256-armv8.pl new file mode 100755 index 00000000000..ba6c2b87980 --- /dev/null +++ b/crypto/blst_src/asm/mul_mont_256-armv8.pl @@ -0,0 +1,409 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# As for "sparse" in subroutine names, see commentary in the +# asm/mulx_mont_256-x86_64.pl module. + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($r_ptr,$a_ptr,$b_ptr,$n_ptr,$n0) = map("x$_", 0..4); + +@mod=map("x$_",(5..8)); +$bi="x9"; +@a=map("x$_",(10..13)); +@tmp=map("x$_",(14..17)); +@acc=map("x$_",(19..24)); +$m0=$n_ptr; + +$code.=<<___; +.text + +.globl mul_mont_sparse_256 +.hidden mul_mont_sparse_256 +.type mul_mont_sparse_256,%function +.align 5 +mul_mont_sparse_256: + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp @a[0],@a[1],[$a_ptr] + ldr $bi, [$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + mul @acc[0],@a[0],$bi + ldp @mod[0],@mod[1],[$n_ptr] + mul @acc[1],@a[1],$bi + ldp @mod[2],@mod[3],[$n_ptr,#16] + mul @acc[2],@a[2],$bi + mul @acc[3],@a[3],$bi + + umulh @tmp[0],@a[0],$bi + umulh @tmp[1],@a[1],$bi + mul $m0,$n0,@acc[0] + umulh @tmp[2],@a[2],$bi + umulh @tmp[3],@a[3],$bi + adds @acc[1],@acc[1],@tmp[0] + //mul @tmp[0],@mod[0],$m0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$m0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$m0 + adc @acc[4],xzr, @tmp[3] + mul @tmp[3],@mod[3],$m0 +___ +for ($i=1;$i<4;$i++) { +$code.=<<___; + ldr $bi,[$b_ptr,8*$i] + subs xzr,@acc[0],#1 //adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$m0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$m0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$m0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$m0 + adc @acc[4],@acc[4],xzr + + adds @acc[0],@acc[1],@tmp[0] + mul @tmp[0],@a[0],$bi + adcs @acc[1],@acc[2],@tmp[1] + mul @tmp[1],@a[1],$bi + adcs @acc[2],@acc[3],@tmp[2] + mul @tmp[2],@a[2],$bi + adcs @acc[3],@acc[4],@tmp[3] + mul @tmp[3],@a[3],$bi + adc @acc[4],xzr,xzr + + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@a[0],$bi + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@a[1],$bi + adcs @acc[2],@acc[2],@tmp[2] + mul $m0,$n0,@acc[0] + umulh @tmp[2],@a[2],$bi + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@a[3],$bi + adc @acc[4],@acc[4],xzr + + adds @acc[1],@acc[1],@tmp[0] + //mul @tmp[0],@mod[0],$m0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$m0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$m0 + adc @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$m0 +___ +} +$code.=<<___; + subs xzr,@acc[0],#1 //adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$m0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$m0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$m0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$m0 + adc @acc[4],@acc[4],xzr + + adds @acc[0],@acc[1],@tmp[0] + adcs @acc[1],@acc[2],@tmp[1] + adcs @acc[2],@acc[3],@tmp[2] + adcs @acc[3],@acc[4],@tmp[3] + adc @acc[4],xzr,xzr + + subs @tmp[0],@acc[0],@mod[0] + sbcs @tmp[1],@acc[1],@mod[1] + sbcs @tmp[2],@acc[2],@mod[2] + sbcs @tmp[3],@acc[3],@mod[3] + sbcs xzr, @acc[4],xzr + + csel @acc[0],@acc[0],@tmp[0],lo + csel @acc[1],@acc[1],@tmp[1],lo + csel @acc[2],@acc[2],@tmp[2],lo + csel @acc[3],@acc[3],@tmp[3],lo + + stp @acc[0],@acc[1],[$r_ptr] + stp @acc[2],@acc[3],[$r_ptr,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret +.size mul_mont_sparse_256,.-mul_mont_sparse_256 +___ +{ +my @acc = (@a,@acc[0..3]); +my @a = @mod; + +$code.=<<___; +.globl sqr_mont_sparse_256 +.hidden sqr_mont_sparse_256 +.type sqr_mont_sparse_256,%function +.align 5 +sqr_mont_sparse_256: + paciasp + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + mov $n0,$n_ptr + + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is @acc[x] + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul @acc[1],@a[1],@a[0] // a[1]*a[0] + umulh @tmp[1],@a[1],@a[0] + mul @acc[2],@a[2],@a[0] // a[2]*a[0] + umulh @tmp[2],@a[2],@a[0] + mul @acc[3],@a[3],@a[0] // a[3]*a[0] + umulh @acc[4],@a[3],@a[0] + + adds @acc[2],@acc[2],@tmp[1] // accumulate high parts of multiplication + mul @tmp[0],@a[2],@a[1] // a[2]*a[1] + umulh @tmp[1],@a[2],@a[1] + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@a[3],@a[1] // a[3]*a[1] + umulh @tmp[3],@a[3],@a[1] + adc @acc[4],@acc[4],xzr // can't overflow + + mul @acc[5],@a[3],@a[2] // a[3]*a[2] + umulh @acc[6],@a[3],@a[2] + + adds @tmp[1],@tmp[1],@tmp[2] // accumulate high parts of multiplication + mul @acc[0],@a[0],@a[0] // a[0]*a[0] + adc @tmp[2],@tmp[3],xzr // can't overflow + + adds @acc[3],@acc[3],@tmp[0] // accumulate low parts of multiplication + umulh @a[0],@a[0],@a[0] + adcs @acc[4],@acc[4],@tmp[1] + mul @tmp[1],@a[1],@a[1] // a[1]*a[1] + adcs @acc[5],@acc[5],@tmp[2] + umulh @a[1],@a[1],@a[1] + adc @acc[6],@acc[6],xzr // can't overflow + + adds @acc[1],@acc[1],@acc[1] // acc[1-6]*=2 + mul @tmp[2],@a[2],@a[2] // a[2]*a[2] + adcs @acc[2],@acc[2],@acc[2] + umulh @a[2],@a[2],@a[2] + adcs @acc[3],@acc[3],@acc[3] + mul @tmp[3],@a[3],@a[3] // a[3]*a[3] + adcs @acc[4],@acc[4],@acc[4] + umulh @a[3],@a[3],@a[3] + adcs @acc[5],@acc[5],@acc[5] + adcs @acc[6],@acc[6],@acc[6] + adc @acc[7],xzr,xzr + + adds @acc[1],@acc[1],@a[0] // +a[i]*a[i] + adcs @acc[2],@acc[2],@tmp[1] + adcs @acc[3],@acc[3],@a[1] + adcs @acc[4],@acc[4],@tmp[2] + adcs @acc[5],@acc[5],@a[2] + adcs @acc[6],@acc[6],@tmp[3] + adc @acc[7],@acc[7],@a[3] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + adds @acc[0],@acc[0],@acc[4] // accumulate upper half + adcs @acc[1],@acc[1],@acc[5] + adcs @acc[2],@acc[2],@acc[6] + adcs @acc[3],@acc[3],@acc[7] + adc @acc[4],xzr,xzr + + subs @tmp[0],@acc[0],@mod[0] + sbcs @tmp[1],@acc[1],@mod[1] + sbcs @tmp[2],@acc[2],@mod[2] + sbcs @tmp[3],@acc[3],@mod[3] + sbcs xzr, @acc[4],xzr + + csel @acc[0],@acc[0],@tmp[0],lo + csel @acc[1],@acc[1],@tmp[1],lo + csel @acc[2],@acc[2],@tmp[2],lo + csel @acc[3],@acc[3],@tmp[3],lo + + stp @acc[0],@acc[1],[$r_ptr] + stp @acc[2],@acc[3],[$r_ptr,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 +___ +} +{ +my @a = (@a, $bi); + +$code.=<<___; +.globl from_mont_256 +.hidden from_mont_256 +.type from_mont_256,%function +.align 5 +from_mont_256: + paciasp + stp x29,x30,[sp,#-16]! 
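+	// from_mont_256 leaves Montgomery form with a single reduction
+	// pass, i.e. a Montgomery multiplication by 1 (the
+	// __mul_by_1_mont_256 call below) followed by one conditional
+	// subtraction of the modulus.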
+ add x29,sp,#0 + + mov $n0,$n_ptr + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + subs @tmp[0],@a[0],@mod[0] + sbcs @tmp[1],@a[1],@mod[1] + sbcs @tmp[2],@a[2],@mod[2] + sbcs @tmp[3],@a[3],@mod[3] + + csel @a[0],@a[0],@tmp[0],lo + csel @a[1],@a[1],@tmp[1],lo + csel @a[2],@a[2],@tmp[2],lo + csel @a[3],@a[3],@tmp[3],lo + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + ldr x29,[sp],#16 + autiasp + ret +.size from_mont_256,.-from_mont_256 + +.globl redc_mont_256 +.hidden redc_mont_256 +.type redc_mont_256,%function +.align 5 +redc_mont_256: + paciasp + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov $n0,$n_ptr + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + ldp @tmp[0],@tmp[1],[$a_ptr,#32] + ldp @tmp[2],@tmp[3],[$a_ptr,#48] + + adds @a[0],@a[0],@tmp[0] + adcs @a[1],@a[1],@tmp[1] + adcs @a[2],@a[2],@tmp[2] + adcs @a[3],@a[3],@tmp[3] + adc @a[4],xzr,xzr + + subs @tmp[0],@a[0],@mod[0] + sbcs @tmp[1],@a[1],@mod[1] + sbcs @tmp[2],@a[2],@mod[2] + sbcs @tmp[3],@a[3],@mod[3] + sbcs xzr, @a[4],xzr + + csel @a[0],@a[0],@tmp[0],lo + csel @a[1],@a[1],@tmp[1],lo + csel @a[2],@a[2],@tmp[2],lo + csel @a[3],@a[3],@tmp[3],lo + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + ldr x29,[sp],#16 + autiasp + ret +.size redc_mont_256,.-redc_mont_256 + +.type __mul_by_1_mont_256,%function +.align 5 +__mul_by_1_mont_256: + mul $m0,$n0,@a[0] + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] +___ +for ($i=1;$i<4;$i++) { +$code.=<<___; + //mul @tmp[0],@mod[0],$m0 + mul @tmp[1],@mod[1],$m0 + mul @tmp[2],@mod[2],$m0 + mul @tmp[3],@mod[3],$m0 + subs xzr,@a[0],#1 //adds @a[0],@a[0],@tmp[0] + umulh @tmp[0],@mod[0],$m0 + adcs @a[1],@a[1],@tmp[1] + umulh @tmp[1],@mod[1],$m0 + adcs @a[2],@a[2],@tmp[2] + umulh @tmp[2],@mod[2],$m0 + adcs @a[3],@a[3],@tmp[3] + umulh @tmp[3],@mod[3],$m0 + adc @a[4],xzr,xzr + + adds @a[0],@a[1],@tmp[0] + adcs @a[1],@a[2],@tmp[1] + adcs @a[2],@a[3],@tmp[2] + mul $m0,$n0,@a[0] + adc @a[3],@a[4],@tmp[3] +___ +} +$code.=<<___; + //mul @tmp[0],@mod[0],$m0 + mul @tmp[1],@mod[1],$m0 + mul @tmp[2],@mod[2],$m0 + mul @tmp[3],@mod[3],$m0 + subs xzr,@a[0],#1 //adds @a[0],@a[0],@tmp[0] + umulh @tmp[0],@mod[0],$m0 + adcs @a[1],@a[1],@tmp[1] + umulh @tmp[1],@mod[1],$m0 + adcs @a[2],@a[2],@tmp[2] + umulh @tmp[2],@mod[2],$m0 + adcs @a[3],@a[3],@tmp[3] + umulh @tmp[3],@mod[3],$m0 + adc @a[4],xzr,xzr + + adds @a[0],@a[1],@tmp[0] + adcs @a[1],@a[2],@tmp[1] + adcs @a[2],@a[3],@tmp[2] + adc @a[3],@a[4],@tmp[3] + + ret +.size __mul_by_1_mont_256,.-__mul_by_1_mont_256 +___ +} + +print $code; + +close STDOUT; diff --git a/crypto/blst_src/asm/mul_mont_384-armv8.pl b/crypto/blst_src/asm/mul_mont_384-armv8.pl new file mode 100755 index 00000000000..44e12a00b03 --- /dev/null +++ b/crypto/blst_src/asm/mul_mont_384-armv8.pl @@ -0,0 +1,2015 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($r_ptr,$a_ptr,$b_ptr,$n_ptr,$n0) = map("x$_", 0..4); + +@mod = map("x$_",(5..10)); +@a = map("x$_",(11..16)); +$bi = "x17"; +@acc = map("x$_",(19..25)); +@tmp = map("x$_",(26..28,0,1,3)); + +$code.=<<___; +.text + +.globl add_mod_384x384 +.type add_mod_384x384,%function +.align 5 +add_mod_384x384: + paciasp + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __add_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + autiasp + ret +.size add_mod_384x384,.-add_mod_384x384 + +.type __add_mod_384x384,%function +.align 5 +__add_mod_384x384: + ldp @a[0], @a[1], [$a_ptr] + ldp @acc[0],@acc[1],[$b_ptr] + ldp @a[2], @a[3], [$a_ptr,#16] + adds @a[0],@a[0],@acc[0] + ldp @acc[2],@acc[3],[$b_ptr,#16] + adcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#32] + adcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#32] + adcs @a[3],@a[3],@acc[3] + stp @a[0], @a[1], [$r_ptr] + adcs @a[4],@a[4],@acc[4] + ldp @a[0], @a[1], [$a_ptr,#48] + adcs @a[5],@a[5],@acc[5] + + ldp @acc[0],@acc[1],[$b_ptr,#48] + stp @a[2], @a[3], [$r_ptr,#16] + ldp @a[2], @a[3], [$a_ptr,#64] + ldp @acc[2],@acc[3],[$b_ptr,#64] + + adcs @a[0],@a[0],@acc[0] + stp @a[4], @a[5], [$r_ptr,#32] + adcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#80] + adcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#80] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adcs @a[5],@a[5],@acc[5] + adc $bi,xzr,xzr + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs @acc[5],@a[5],@mod[5] + sbcs xzr,$bi,xzr + + csel @a[0],@a[0],@acc[0],lo + csel @a[1],@a[1],@acc[1],lo + csel @a[2],@a[2],@acc[2],lo + csel @a[3],@a[3],@acc[3],lo + stp @a[0],@a[1],[$r_ptr,#48] + csel @a[4],@a[4],@acc[4],lo + stp @a[2],@a[3],[$r_ptr,#64] + csel @a[5],@a[5],@acc[5],lo + stp @a[4],@a[5],[$r_ptr,#80] + + ret +.size __add_mod_384x384,.-__add_mod_384x384 + +.globl sub_mod_384x384 +.type sub_mod_384x384,%function +.align 5 +sub_mod_384x384: + paciasp + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + autiasp + ret +.size sub_mod_384x384,.-sub_mod_384x384 + +.type __sub_mod_384x384,%function +.align 5 +__sub_mod_384x384: + ldp @a[0], @a[1], [$a_ptr] + ldp @acc[0],@acc[1],[$b_ptr] + ldp @a[2], @a[3], [$a_ptr,#16] + subs @a[0],@a[0],@acc[0] + ldp @acc[2],@acc[3],[$b_ptr,#16] + sbcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#32] + sbcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#32] + sbcs @a[3],@a[3],@acc[3] + stp @a[0], @a[1], [$r_ptr] + sbcs @a[4],@a[4],@acc[4] + ldp @a[0], @a[1], [$a_ptr,#48] + sbcs @a[5],@a[5],@acc[5] + + ldp @acc[0],@acc[1],[$b_ptr,#48] + stp @a[2], @a[3], [$r_ptr,#16] + ldp @a[2], @a[3], [$a_ptr,#64] + ldp @acc[2],@acc[3],[$b_ptr,#64] + + sbcs @a[0],@a[0],@acc[0] + stp @a[4], @a[5], [$r_ptr,#32] + sbcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#80] + sbcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#80] + sbcs @a[3],@a[3],@acc[3] + sbcs @a[4],@a[4],@acc[4] + sbcs @a[5],@a[5],@acc[5] + sbc $bi,xzr,xzr + + and @acc[0],@mod[0],$bi + and @acc[1],@mod[1],$bi + adds @a[0],@a[0],@acc[0] + and @acc[2],@mod[2],$bi + adcs @a[1],@a[1],@acc[1] + and @acc[3],@mod[3],$bi + adcs @a[2],@a[2],@acc[2] + and @acc[4],@mod[4],$bi + adcs @a[3],@a[3],@acc[3] + and @acc[5],@mod[5],$bi + adcs @a[4],@a[4],@acc[4] + stp @a[0],@a[1],[$r_ptr,#48] + adc @a[5],@a[5],@acc[5] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ret +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,%function +.align 5 +__add_mod_384: + ldp @a[0], @a[1], [$a_ptr] + ldp @acc[0],@acc[1],[$b_ptr] + ldp @a[2], @a[3], [$a_ptr,#16] + adds @a[0],@a[0],@acc[0] + ldp @acc[2],@acc[3],[$b_ptr,#16] + adcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#32] + adcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#32] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adcs @a[5],@a[5],@acc[5] + adc $bi,xzr,xzr + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs @acc[5],@a[5],@mod[5] + sbcs xzr,$bi,xzr + + csel @a[0],@a[0],@acc[0],lo + csel @a[1],@a[1],@acc[1],lo + csel @a[2],@a[2],@acc[2],lo + csel @a[3],@a[3],@acc[3],lo + csel @a[4],@a[4],@acc[4],lo + stp @a[0],@a[1],[$r_ptr] + csel @a[5],@a[5],@acc[5],lo + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ret +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,%function +.align 5 +__sub_mod_384: + ldp @a[0], @a[1], [$a_ptr] + ldp @acc[0],@acc[1],[$b_ptr] + ldp @a[2], @a[3], [$a_ptr,#16] + subs @a[0],@a[0],@acc[0] + ldp @acc[2],@acc[3],[$b_ptr,#16] + sbcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#32] + sbcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#32] + sbcs @a[3],@a[3],@acc[3] + sbcs @a[4],@a[4],@acc[4] + sbcs @a[5],@a[5],@acc[5] + sbc $bi,xzr,xzr + + and @acc[0],@mod[0],$bi + and @acc[1],@mod[1],$bi + adds @a[0],@a[0],@acc[0] + and @acc[2],@mod[2],$bi + adcs @a[1],@a[1],@acc[1] + and @acc[3],@mod[3],$bi + adcs @a[2],@a[2],@acc[2] + and @acc[4],@mod[4],$bi + adcs @a[3],@a[3],@acc[3] + and @acc[5],@mod[5],$bi + adcs @a[4],@a[4],@acc[4] + stp @a[0],@a[1],[$r_ptr] + adc @a[5],@a[5],@acc[5] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ret 
+.size __sub_mod_384,.-__sub_mod_384 + +.globl mul_mont_384x +.hidden mul_mont_384x +.type mul_mont_384x,%function +.align 5 +mul_mont_384x: + paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#288 // space for 3 768-bit vectors + + mov @tmp[0],$r_ptr // save r_ptr + mov @tmp[1],$a_ptr // save b_ptr + mov @tmp[2],$b_ptr // save b_ptr + + sub $r_ptr,sp,#0 // mul_384(t0, a->re, b->re) + bl __mul_384 + + add $a_ptr,$a_ptr,#48 // mul_384(t1, a->im, b->im) + add $b_ptr,$b_ptr,#48 + add $r_ptr,sp,#96 + bl __mul_384 + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + sub $b_ptr,$a_ptr,#48 + add $r_ptr,sp,#240 + bl __add_mod_384 + + add $a_ptr,@tmp[2],#0 + add $b_ptr,@tmp[2],#48 + add $r_ptr,sp,#192 // t2 + bl __add_mod_384 + + add $a_ptr,$r_ptr,#0 + add $b_ptr,$r_ptr,#48 + bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + mov $a_ptr,$r_ptr + add $b_ptr,sp,#0 + bl __sub_mod_384x384 + + add $b_ptr,sp,#96 + bl __sub_mod_384x384 // t2 = t2-t0-t1 + + add $a_ptr,sp,#0 + add $b_ptr,sp,#96 + add $r_ptr,sp,#0 + bl __sub_mod_384x384 // t0 = t0-t1 + + add $a_ptr,sp,#0 // ret->re = redc(t0) + add $r_ptr,@tmp[0],#0 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + + add $a_ptr,sp,#192 // ret->im = redc(t2) + add $r_ptr,$r_ptr,#48 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#288 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size mul_mont_384x,.-mul_mont_384x + +.globl sqr_mont_384x +.hidden sqr_mont_384x +.type sqr_mont_384x,%function +.align 5 +sqr_mont_384x: + paciasp + stp x29,x30,[sp,#-128]! 
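+	// sqr_mont_384x squares an Fp2 element with the familiar identity
+	// (re + im*i)^2 = (re+im)*(re-im) + 2*re*im*i, so only two
+	// Montgomery multiplications are needed; a rough sketch of the
+	// flow below (t0/t1 match the comments further down):
+	//   t0 = re + im,  t1 = re - im      (mod p)
+	//   ret_im = 2 * mont_mul(re, im)
+	//   ret_re = mont_mul(t0, t1)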
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp $n_ptr,$r_ptr,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 2 384-bit vectors + mov $n0,$n_ptr // adjust for missing b_ptr + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + add $b_ptr,$a_ptr,#48 + add $r_ptr,sp,#0 + bl __add_mod_384 // t0 = a->re + a->im + + add $r_ptr,sp,#48 + bl __sub_mod_384 // t1 = a->re - a->im + + ldp @a[0],@a[1],[$a_ptr] + ldr $bi, [$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) + + adds @a[0],@a[0],@a[0] // add with itself + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc @acc[6],xzr,xzr + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs @acc[5],@a[5],@mod[5] + sbcs xzr,@acc[6],xzr + + csel @acc[0],@a[0],@acc[0],lo + csel @acc[1],@a[1],@acc[1],lo + csel @acc[2],@a[2],@acc[2],lo + ldp @a[0],@a[1],[sp] + csel @acc[3],@a[3],@acc[3],lo + ldr $bi, [sp,#48] + csel @acc[4],@a[4],@acc[4],lo + ldp @a[2],@a[3],[sp,#16] + csel @acc[5],@a[5],@acc[5],lo + ldp @a[4],@a[5],[sp,#32] + + stp @acc[0],@acc[1],[$b_ptr,#48] + stp @acc[2],@acc[3],[$b_ptr,#64] + stp @acc[4],@acc[5],[$b_ptr,#80] + + add $b_ptr,sp,#48 + bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) + ldr x30,[x29,#8] + + stp @a[0],@a[1],[$b_ptr] + stp @a[2],@a[3],[$b_ptr,#16] + stp @a[4],@a[5],[$b_ptr,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sqr_mont_384x,.-sqr_mont_384x + +.globl mul_mont_384 +.hidden mul_mont_384 +.type mul_mont_384,%function +.align 5 +mul_mont_384: + paciasp + stp x29,x30,[sp,#-128]! 
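+	// The shared __mul_mont_384 helper below interleaves schoolbook
+	// multiplication with word-wise Montgomery reduction.  A rough
+	// Python-style model (names illustrative; n0 is precomputed so that
+	// n0 times the low limb of the modulus is -1 mod 2^64):
+	//   acc = 0
+	//   for bi in limbs_of(b):          # one 64-bit limb per iteration
+	//       acc += a * bi
+	//       m = (n0 * acc) % 2**64      # chosen so the low limb cancels
+	//       acc = (acc + m * modulus) >> 64
+	//   return acc if acc < modulus else acc - modulus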
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp $n0,$r_ptr,[sp,#96] // __mul_mont_384 wants them there + + ldp @a[0],@a[1],[$a_ptr] + ldr $bi, [$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp @a[0],@a[1],[$b_ptr] + stp @a[2],@a[3],[$b_ptr,#16] + stp @a[4],@a[5],[$b_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size mul_mont_384,.-mul_mont_384 + +.type __mul_mont_384,%function +.align 5 +__mul_mont_384: + mul @acc[0],@a[0],$bi + mul @acc[1],@a[1],$bi + mul @acc[2],@a[2],$bi + mul @acc[3],@a[3],$bi + mul @acc[4],@a[4],$bi + mul @acc[5],@a[5],$bi + mul $n0,$n0,@acc[0] + + umulh @tmp[0],@a[0],$bi + umulh @tmp[1],@a[1],$bi + umulh @tmp[2],@a[2],$bi + umulh @tmp[3],@a[3],$bi + umulh @tmp[4],@a[4],$bi + umulh @tmp[5],@a[5],$bi + + adds @acc[1],@acc[1],@tmp[0] + // mul @tmp[0],@mod[0],$n0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$n0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$n0 + adcs @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$n0 + adcs @acc[5],@acc[5],@tmp[4] + mul @tmp[4],@mod[4],$n0 + adc @acc[6],xzr, @tmp[5] + mul @tmp[5],@mod[5],$n0 + mov $bi,xzr +___ +for ($i=1;$i<6;$i++) { +$code.=<<___; + subs xzr,@acc[0],#1 // adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$n0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$n0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$n0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$n0 + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@mod[4],$n0 + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@mod[5],$n0 + adcs @acc[6],@acc[6],xzr + adc $n0,$bi,xzr + ldr $bi,[$b_ptr,8*$i] + + adds @acc[0],@acc[1],@tmp[0] + mul @tmp[0],@a[0],$bi + adcs @acc[1],@acc[2],@tmp[1] + mul @tmp[1],@a[1],$bi + adcs @acc[2],@acc[3],@tmp[2] + mul @tmp[2],@a[2],$bi + adcs @acc[3],@acc[4],@tmp[3] + mul @tmp[3],@a[3],$bi + adcs @acc[4],@acc[5],@tmp[4] + mul @tmp[4],@a[4],$bi + adcs @acc[5],@acc[6],@tmp[5] + mul @tmp[5],@a[5],$bi + adc @acc[6],$n0,xzr + ldr $n0,[x29,#96] + + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@a[0],$bi + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@a[1],$bi + adcs @acc[2],@acc[2],@tmp[2] + mul $n0,$n0,@acc[0] + umulh @tmp[2],@a[2],$bi + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@a[3],$bi + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@a[4],$bi + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@a[5],$bi + adcs @acc[6],@acc[6],xzr + adc $bi,xzr,xzr + + adds @acc[1],@acc[1],@tmp[0] + // mul @tmp[0],@mod[0],$n0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$n0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$n0 + adcs @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$n0 + adcs @acc[5],@acc[5],@tmp[4] + mul @tmp[4],@mod[4],$n0 + adcs @acc[6],@acc[6],@tmp[5] + mul @tmp[5],@mod[5],$n0 + adc $bi,$bi,xzr +___ +} +$code.=<<___; + subs xzr,@acc[0],#1 // adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$n0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$n0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$n0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$n0 + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@mod[4],$n0 + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@mod[5],$n0 + adcs @acc[6],@acc[6],xzr + ldp 
$n0,$b_ptr,[x29,#96] // pull r_ptr + adc $bi,$bi,xzr + + adds @acc[0],@acc[1],@tmp[0] + adcs @acc[1],@acc[2],@tmp[1] + adcs @acc[2],@acc[3],@tmp[2] + adcs @acc[3],@acc[4],@tmp[3] + adcs @acc[4],@acc[5],@tmp[4] + adcs @acc[5],@acc[6],@tmp[5] + adc @acc[6],$bi,xzr + + subs @tmp[0],@acc[0],@mod[0] + sbcs @tmp[1],@acc[1],@mod[1] + sbcs @tmp[2],@acc[2],@mod[2] + sbcs @tmp[3],@acc[3],@mod[3] + sbcs @tmp[4],@acc[4],@mod[4] + sbcs @tmp[5],@acc[5],@mod[5] + sbcs xzr, @acc[6],xzr + + csel @a[0],@acc[0],@tmp[0],lo + csel @a[1],@acc[1],@tmp[1],lo + csel @a[2],@acc[2],@tmp[2],lo + csel @a[3],@acc[3],@tmp[3],lo + csel @a[4],@acc[4],@tmp[4],lo + csel @a[5],@acc[5],@tmp[5],lo + ret +.size __mul_mont_384,.-__mul_mont_384 + +.globl sqr_mont_384 +.hidden sqr_mont_384 +.type sqr_mont_384,%function +.align 5 +sqr_mont_384: + paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for 768-bit vector + mov $n0,$n_ptr // adjust for missing b_ptr + + mov $n_ptr,$r_ptr // save r_ptr + mov $r_ptr,sp + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + bl __sqr_384 + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + mov $a_ptr,sp + mov $r_ptr,$n_ptr // restore r_ptr + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sqr_mont_384,.-sqr_mont_384 + +.globl sqr_n_mul_mont_383 +.hidden sqr_n_mul_mont_383 +.type sqr_n_mul_mont_383,%function +.align 5 +sqr_n_mul_mont_383: + paciasp + stp x29,x30,[sp,#-128]! 
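+	// sqr_n_mul_mont_383 squares its input a given number of times (the
+	// counter travels in the register slot otherwise used for b_ptr,
+	// while the multiplicand pointer arrives in x5) and finishes with
+	// one Montgomery multiplication, so in effect ret is the Montgomery
+	// form of a^(2^count) * b.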
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp $n0,$r_ptr,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 768-bit vector + mov $bi,x5 // save b_ptr + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + mov $r_ptr,sp +.Loop_sqr_383: + bl __sqr_384 + sub $b_ptr,$b_ptr,#1 // counter + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + mov $a_ptr,sp + bl __mul_by_1_mont_384 + + ldp @acc[0],@acc[1],[$a_ptr,#48] + ldp @acc[2],@acc[3],[$a_ptr,#64] + ldp @acc[4],@acc[5],[$a_ptr,#80] + + adds @a[0],@a[0],@acc[0] // just accumulate upper half + adcs @a[1],@a[1],@acc[1] + adcs @a[2],@a[2],@acc[2] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adc @a[5],@a[5],@acc[5] + + cbnz $b_ptr,.Loop_sqr_383 + + mov $b_ptr,$bi + ldr $bi,[$bi] + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp @a[0],@a[1],[$b_ptr] + stp @a[2],@a[3],[$b_ptr,#16] + stp @a[4],@a[5],[$b_ptr,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 +___ +{ +my @acc=(@acc,@tmp[0..2]); + +$code.=<<___; +.type __sqr_384,%function +.align 5 +__sqr_384: + mul @acc[0],@a[1],@a[0] + mul @acc[1],@a[2],@a[0] + mul @acc[2],@a[3],@a[0] + mul @acc[3],@a[4],@a[0] + mul @acc[4],@a[5],@a[0] + + umulh @mod[1],@a[1],@a[0] + umulh @mod[2],@a[2],@a[0] + umulh @mod[3],@a[3],@a[0] + umulh @mod[4],@a[4],@a[0] + adds @acc[1],@acc[1],@mod[1] + umulh @mod[5],@a[5],@a[0] + adcs @acc[2],@acc[2],@mod[2] + mul @mod[2],@a[2],@a[1] + adcs @acc[3],@acc[3],@mod[3] + mul @mod[3],@a[3],@a[1] + adcs @acc[4],@acc[4],@mod[4] + mul @mod[4],@a[4],@a[1] + adc @acc[5],xzr, @mod[5] + mul @mod[5],@a[5],@a[1] + + adds @acc[2],@acc[2],@mod[2] + umulh @mod[2],@a[2],@a[1] + adcs @acc[3],@acc[3],@mod[3] + umulh @mod[3],@a[3],@a[1] + adcs @acc[4],@acc[4],@mod[4] + umulh @mod[4],@a[4],@a[1] + adcs @acc[5],@acc[5],@mod[5] + umulh @mod[5],@a[5],@a[1] + adc @acc[6],xzr,xzr + + mul @mod[0],@a[0],@a[0] + adds @acc[3],@acc[3],@mod[2] + umulh @a[0], @a[0],@a[0] + adcs @acc[4],@acc[4],@mod[3] + mul @mod[3],@a[3],@a[2] + adcs @acc[5],@acc[5],@mod[4] + mul @mod[4],@a[4],@a[2] + adc @acc[6],@acc[6],@mod[5] + mul @mod[5],@a[5],@a[2] + + adds @acc[4],@acc[4],@mod[3] + umulh @mod[3],@a[3],@a[2] + adcs @acc[5],@acc[5],@mod[4] + umulh @mod[4],@a[4],@a[2] + adcs @acc[6],@acc[6],@mod[5] + umulh @mod[5],@a[5],@a[2] + adc @acc[7],xzr,xzr + + mul @mod[1],@a[1],@a[1] + adds @acc[5],@acc[5],@mod[3] + umulh @a[1], @a[1],@a[1] + adcs @acc[6],@acc[6],@mod[4] + mul @mod[4],@a[4],@a[3] + adc @acc[7],@acc[7],@mod[5] + mul @mod[5],@a[5],@a[3] + + adds @acc[6],@acc[6],@mod[4] + umulh @mod[4],@a[4],@a[3] + adcs @acc[7],@acc[7],@mod[5] + umulh @mod[5],@a[5],@a[3] + adc @acc[8],xzr,xzr + mul @mod[2],@a[2],@a[2] + adds @acc[7],@acc[7],@mod[4] + umulh @a[2], @a[2],@a[2] + adc @acc[8],@acc[8],@mod[5] + mul @mod[3],@a[3],@a[3] + + mul @mod[5],@a[5],@a[4] + umulh @a[3], @a[3],@a[3] + adds @acc[8],@acc[8],@mod[5] + umulh @mod[5],@a[5],@a[4] + mul @mod[4],@a[4],@a[4] + adc @acc[9],@mod[5],xzr + + adds @acc[0],@acc[0],@acc[0] + adcs @acc[1],@acc[1],@acc[1] + adcs @acc[2],@acc[2],@acc[2] + adcs @acc[3],@acc[3],@acc[3] + adcs @acc[4],@acc[4],@acc[4] + adcs @acc[5],@acc[5],@acc[5] + adcs @acc[6],@acc[6],@acc[6] + adcs @acc[7],@acc[7],@acc[7] + umulh @a[4], 
@a[4],@a[4] + adcs @acc[8],@acc[8],@acc[8] + mul @mod[5],@a[5],@a[5] + adcs @acc[9],@acc[9],@acc[9] + umulh @a[5], @a[5],@a[5] + adc $a_ptr,xzr,xzr + + adds @acc[0],@acc[0],@a[0] + adcs @acc[1],@acc[1],@mod[1] + adcs @acc[2],@acc[2],@a[1] + adcs @acc[3],@acc[3],@mod[2] + adcs @acc[4],@acc[4],@a[2] + adcs @acc[5],@acc[5],@mod[3] + adcs @acc[6],@acc[6],@a[3] + stp @mod[0],@acc[0],[$r_ptr] + adcs @acc[7],@acc[7],@mod[4] + stp @acc[1],@acc[2],[$r_ptr,#16] + adcs @acc[8],@acc[8],@a[4] + stp @acc[3],@acc[4],[$r_ptr,#32] + adcs @acc[9],@acc[9],@mod[5] + stp @acc[5],@acc[6],[$r_ptr,#48] + adc @a[5],@a[5],$a_ptr + stp @acc[7],@acc[8],[$r_ptr,#64] + stp @acc[9],@a[5],[$r_ptr,#80] + + ret +.size __sqr_384,.-__sqr_384 +___ +} +$code.=<<___; +.globl sqr_384 +.hidden sqr_384 +.type sqr_384,%function +.align 5 +sqr_384: + paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + bl __sqr_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sqr_384,.-sqr_384 + +.globl redc_mont_384 +.hidden redc_mont_384 +.type redc_mont_384,%function +.align 5 +redc_mont_384: + paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov $n0,$n_ptr // adjust for missing b_ptr + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size redc_mont_384,.-redc_mont_384 + +.globl from_mont_384 +.hidden from_mont_384 +.type from_mont_384,%function +.align 5 +from_mont_384: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov $n0,$n_ptr // adjust for missing b_ptr + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs @acc[5],@a[5],@mod[5] + + csel @a[0],@a[0],@acc[0],lo + csel @a[1],@a[1],@acc[1],lo + csel @a[2],@a[2],@acc[2],lo + csel @a[3],@a[3],@acc[3],lo + csel @a[4],@a[4],@acc[4],lo + csel @a[5],@a[5],@acc[5],lo + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size from_mont_384,.-from_mont_384 + +.type __mul_by_1_mont_384,%function +.align 5 +__mul_by_1_mont_384: + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + mul @tmp[0],$n0,@a[0] + ldp @a[4],@a[5],[$a_ptr,#32] + + // mul @acc[0],@mod[0],@tmp[0] + mul @acc[1],@mod[1],@tmp[0] + mul @acc[2],@mod[2],@tmp[0] + mul @acc[3],@mod[3],@tmp[0] + mul @acc[4],@mod[4],@tmp[0] + mul @acc[5],@mod[5],@tmp[0] + subs xzr,@a[0],#1 // adds @acc[0],@acc[0],@a[0] + umulh @a[0],@mod[0],@tmp[0] + adcs @acc[1],@acc[1],@a[1] + umulh @a[1],@mod[1],@tmp[0] + adcs @acc[2],@acc[2],@a[2] + umulh @a[2],@mod[2],@tmp[0] + adcs @acc[3],@acc[3],@a[3] + umulh @a[3],@mod[3],@tmp[0] + adcs @acc[4],@acc[4],@a[4] + umulh @a[4],@mod[4],@tmp[0] + adcs @acc[5],@acc[5],@a[5] + umulh @a[5],@mod[5],@tmp[0] + adc @acc[6],xzr,xzr +___ +for ($i=1;$i<6;$i++) { +$code.=<<___; + adds @a[0],@a[0],@acc[1] + adcs @a[1],@a[1],@acc[2] + adcs @a[2],@a[2],@acc[3] + mul @tmp[0],$n0,@a[0] + adcs @a[3],@a[3],@acc[4] + adcs @a[4],@a[4],@acc[5] + adc @a[5],@a[5],@acc[6] + + // mul @acc[0],@mod[0],@tmp[0] + mul @acc[1],@mod[1],@tmp[0] + mul @acc[2],@mod[2],@tmp[0] + mul @acc[3],@mod[3],@tmp[0] + mul @acc[4],@mod[4],@tmp[0] + mul @acc[5],@mod[5],@tmp[0] + subs xzr,@a[0],#1 // adds @acc[0],@acc[0],@a[0] + umulh @a[0],@mod[0],@tmp[0] + adcs @acc[1],@acc[1],@a[1] + umulh @a[1],@mod[1],@tmp[0] + adcs @acc[2],@acc[2],@a[2] + umulh @a[2],@mod[2],@tmp[0] + adcs @acc[3],@acc[3],@a[3] + umulh @a[3],@mod[3],@tmp[0] + adcs @acc[4],@acc[4],@a[4] + umulh @a[4],@mod[4],@tmp[0] + adcs @acc[5],@acc[5],@a[5] + umulh @a[5],@mod[5],@tmp[0] + adc @acc[6],xzr,xzr +___ +} +$code.=<<___; + adds @a[0],@a[0],@acc[1] + adcs @a[1],@a[1],@acc[2] + adcs @a[2],@a[2],@acc[3] + adcs @a[3],@a[3],@acc[4] + adcs @a[4],@a[4],@acc[5] + adc @a[5],@a[5],@acc[6] + + ret +.size __mul_by_1_mont_384,.-__mul_by_1_mont_384 + +.type __redc_tail_mont_384,%function +.align 5 +__redc_tail_mont_384: + ldp @acc[0],@acc[1],[$a_ptr,#48] + ldp @acc[2],@acc[3],[$a_ptr,#64] + ldp @acc[4],@acc[5],[$a_ptr,#80] + + adds @a[0],@a[0],@acc[0] // accumulate upper half + adcs @a[1],@a[1],@acc[1] + adcs @a[2],@a[2],@acc[2] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adcs @a[5],@a[5],@acc[5] + adc @acc[6],xzr,xzr + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs @acc[5],@a[5],@mod[5] + sbcs xzr,@acc[6],xzr + + csel @a[0],@a[0],@acc[0],lo + csel @a[1],@a[1],@acc[1],lo + csel @a[2],@a[2],@acc[2],lo + csel @a[3],@a[3],@acc[3],lo + csel @a[4],@a[4],@acc[4],lo + csel 
@a[5],@a[5],@acc[5],lo + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ret +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl mul_384 +.hidden mul_384 +.type mul_384,%function +.align 5 +mul_384: + paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + bl __mul_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size mul_384,.-mul_384 + +.type __mul_384,%function +.align 5 +__mul_384: + ldp @a[0],@a[1],[$a_ptr] + ldr $bi, [$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + mul @acc[0],@a[0],$bi + mul @acc[1],@a[1],$bi + mul @acc[2],@a[2],$bi + mul @acc[3],@a[3],$bi + mul @acc[4],@a[4],$bi + mul @acc[5],@a[5],$bi + + umulh @mod[0],@a[0],$bi + umulh @mod[1],@a[1],$bi + umulh @mod[2],@a[2],$bi + umulh @mod[3],@a[3],$bi + umulh @mod[4],@a[4],$bi + umulh @mod[5],@a[5],$bi + ldr $bi,[$b_ptr,8*1] + + str @acc[0],[$r_ptr] + adds @acc[0],@acc[1],@mod[0] + mul @mod[0],@a[0],$bi + adcs @acc[1],@acc[2],@mod[1] + mul @mod[1],@a[1],$bi + adcs @acc[2],@acc[3],@mod[2] + mul @mod[2],@a[2],$bi + adcs @acc[3],@acc[4],@mod[3] + mul @mod[3],@a[3],$bi + adcs @acc[4],@acc[5],@mod[4] + mul @mod[4],@a[4],$bi + adc @acc[5],xzr, @mod[5] + mul @mod[5],@a[5],$bi +___ +for ($i=1;$i<5;$i++) { +$code.=<<___; + adds @acc[0],@acc[0],@mod[0] + umulh @mod[0],@a[0],$bi + adcs @acc[1],@acc[1],@mod[1] + umulh @mod[1],@a[1],$bi + adcs @acc[2],@acc[2],@mod[2] + umulh @mod[2],@a[2],$bi + adcs @acc[3],@acc[3],@mod[3] + umulh @mod[3],@a[3],$bi + adcs @acc[4],@acc[4],@mod[4] + umulh @mod[4],@a[4],$bi + adcs @acc[5],@acc[5],@mod[5] + umulh @mod[5],@a[5],$bi + ldr $bi,[$b_ptr,#8*($i+1)] + adc @acc[6],xzr,xzr + + str @acc[0],[$r_ptr,8*$i] + adds @acc[0],@acc[1],@mod[0] + mul @mod[0],@a[0],$bi + adcs @acc[1],@acc[2],@mod[1] + mul @mod[1],@a[1],$bi + adcs @acc[2],@acc[3],@mod[2] + mul @mod[2],@a[2],$bi + adcs @acc[3],@acc[4],@mod[3] + mul @mod[3],@a[3],$bi + adcs @acc[4],@acc[5],@mod[4] + mul @mod[4],@a[4],$bi + adc @acc[5],@acc[6],@mod[5] + mul @mod[5],@a[5],$bi +___ +} +$code.=<<___; + adds @acc[0],@acc[0],@mod[0] + umulh @mod[0],@a[0],$bi + adcs @acc[1],@acc[1],@mod[1] + umulh @mod[1],@a[1],$bi + adcs @acc[2],@acc[2],@mod[2] + umulh @mod[2],@a[2],$bi + adcs @acc[3],@acc[3],@mod[3] + umulh @mod[3],@a[3],$bi + adcs @acc[4],@acc[4],@mod[4] + umulh @mod[4],@a[4],$bi + adcs @acc[5],@acc[5],@mod[5] + umulh @mod[5],@a[5],$bi + adc @acc[6],xzr,xzr + + str @acc[0],[$r_ptr,8*$i] + adds @acc[0],@acc[1],@mod[0] + adcs @acc[1],@acc[2],@mod[1] + adcs @acc[2],@acc[3],@mod[2] + adcs @acc[3],@acc[4],@mod[3] + adcs @acc[4],@acc[5],@mod[4] + adc @acc[5],@acc[6],@mod[5] + + stp @acc[0],@acc[1],[$r_ptr,#48] + stp @acc[2],@acc[3],[$r_ptr,#64] + stp @acc[4],@acc[5],[$r_ptr,#80] + + ret +.size __mul_384,.-__mul_384 + +.globl mul_382x +.hidden mul_382x +.type mul_382x,%function +.align 5 +mul_382x: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for two 384-bit vectors + + ldp @a[0],@a[1],[$a_ptr] + mov @tmp[0],$r_ptr // save r_ptr + ldp @acc[0],@acc[1],[$a_ptr,#48] + mov @tmp[1],$a_ptr // save a_ptr + ldp @a[2],@a[3],[$a_ptr,#16] + mov @tmp[2],$b_ptr // save b_ptr + ldp @acc[2],@acc[3],[$a_ptr,#64] + ldp @a[4],@a[5],[$a_ptr,#32] + adds @mod[0],$a[0],@acc[0] // t0 = a->re + a->im + ldp @acc[4],@acc[5],[$a_ptr,#80] + adcs @mod[1],$a[1],@acc[1] + ldp @a[0],@a[1],[$b_ptr] + adcs @mod[2],$a[2],@acc[2] + ldp @acc[0],@acc[1],[$b_ptr,#48] + adcs @mod[3],$a[3],@acc[3] + ldp @a[2],@a[3],[$b_ptr,#16] + adcs @mod[4],$a[4],@acc[4] + ldp @acc[2],@acc[3],[$b_ptr,#64] + adc @mod[5],$a[5],@acc[5] + ldp @a[4],@a[5],[$b_ptr,#32] + + stp @mod[0],@mod[1],[sp] + adds @mod[0],$a[0],@acc[0] // t1 = b->re + b->im + ldp @acc[4],@acc[5],[$b_ptr,#80] + adcs @mod[1],$a[1],@acc[1] + stp @mod[2],@mod[3],[sp,#16] + adcs @mod[2],$a[2],@acc[2] + adcs @mod[3],$a[3],@acc[3] + stp @mod[4],@mod[5],[sp,#32] + adcs @mod[4],$a[4],@acc[4] + stp @mod[0],@mod[1],[sp,#48] + adc @mod[5],$a[5],@acc[5] + stp @mod[2],@mod[3],[sp,#64] + stp @mod[4],@mod[5],[sp,#80] + + bl __mul_384 // mul_384(ret->re, a->re, b->re) + + add $a_ptr,sp,#0 // mul_384(ret->im, t0, t1) + add $b_ptr,sp,#48 + add $r_ptr,@tmp[0],#96 + bl __mul_384 + + add $a_ptr,@tmp[1],#48 // mul_384(tx, a->im, b->im) + add $b_ptr,@tmp[2],#48 + add $r_ptr,sp,#0 + bl __mul_384 + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + add $a_ptr,@tmp[0],#96 // ret->im -= tx + add $b_ptr,sp,#0 + add $r_ptr,@tmp[0],#96 + bl __sub_mod_384x384 + + add $b_ptr,@tmp[0],#0 // ret->im -= ret->re + bl __sub_mod_384x384 + + add $a_ptr,@tmp[0],#0 // ret->re -= tx + add $b_ptr,sp,#0 + add $r_ptr,@tmp[0],#0 + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size mul_382x,.-mul_382x + +.globl sqr_382x +.hidden sqr_382x +.type sqr_382x,%function +.align 5 +sqr_382x: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp @a[0],@a[1],[$a_ptr] + ldp @acc[0],@acc[1],[$a_ptr,#48] + ldp @a[2],@a[3],[$a_ptr,#16] + adds @mod[0],$a[0],@acc[0] // t0 = a->re + a->im + ldp @acc[2],@acc[3],[$a_ptr,#64] + adcs @mod[1],$a[1],@acc[1] + ldp @a[4],@a[5],[$a_ptr,#32] + adcs @mod[2],$a[2],@acc[2] + ldp @acc[4],@acc[5],[$a_ptr,#80] + adcs @mod[3],$a[3],@acc[3] + stp @mod[0],@mod[1],[$r_ptr] + adcs @mod[4],$a[4],@acc[4] + ldp @mod[0],@mod[1],[$b_ptr] + adc @mod[5],$a[5],@acc[5] + stp @mod[2],@mod[3],[$r_ptr,#16] + + subs @a[0],$a[0],@acc[0] // t1 = a->re - a->im + ldp @mod[2],@mod[3],[$b_ptr,#16] + sbcs @a[1],$a[1],@acc[1] + stp @mod[4],@mod[5],[$r_ptr,#32] + sbcs @a[2],$a[2],@acc[2] + ldp @mod[4],@mod[5],[$b_ptr,#32] + sbcs @a[3],$a[3],@acc[3] + sbcs @a[4],$a[4],@acc[4] + sbcs @a[5],$a[5],@acc[5] + sbc @acc[6],xzr,xzr + + and @acc[0],@mod[0],@acc[6] + and @acc[1],@mod[1],@acc[6] + adds @a[0],@a[0],@acc[0] + and @acc[2],@mod[2],@acc[6] + adcs @a[1],@a[1],@acc[1] + and @acc[3],@mod[3],@acc[6] + adcs @a[2],@a[2],@acc[2] + and @acc[4],@mod[4],@acc[6] + adcs @a[3],@a[3],@acc[3] + and @acc[5],@mod[5],@acc[6] + adcs @a[4],@a[4],@acc[4] + stp @a[0],@a[1],[$r_ptr,#48] + adc @a[5],@a[5],@acc[5] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + mov $n0,$a_ptr // save a_ptr + add $a_ptr,$r_ptr,#0 // mul_384(ret->re, t0, t1) + add $b_ptr,$r_ptr,#48 + bl __mul_384 + + add $a_ptr,$n0,#0 // mul_384(ret->im, a->re, a->im) + add $b_ptr,$n0,#48 + add $r_ptr,$r_ptr,#96 + bl __mul_384 + ldr x30,[x29,#8] + + ldp @a[0],@a[1],[$r_ptr] + ldp @a[2],@a[3],[$r_ptr,#16] + adds @a[0],@a[0],@a[0] // add with itself + ldp @a[4],@a[5],[$r_ptr,#32] + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adcs @acc[0],@acc[0],@acc[0] + adcs @acc[1],@acc[1],@acc[1] + stp @a[0],@a[1],[$r_ptr] + adcs @acc[2],@acc[2],@acc[2] + stp @a[2],@a[3],[$r_ptr,#16] + adcs @acc[3],@acc[3],@acc[3] + stp @a[4],@a[5],[$r_ptr,#32] + adcs @acc[4],@acc[4],@acc[4] + stp @acc[0],@acc[1],[$r_ptr,#48] + adc @acc[5],@acc[5],@acc[5] + stp @acc[2],@acc[3],[$r_ptr,#64] + stp @acc[4],@acc[5],[$r_ptr,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sqr_382x,.-sqr_382x + +.globl sqr_mont_382x +.hidden sqr_mont_382x +.type sqr_mont_382x,%function +.align 5 +sqr_mont_382x: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp $n_ptr,$r_ptr,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#112 // space for two 384-bit vectors + word + mov $n0,$n_ptr // adjust for missing b_ptr + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp $bi,@acc[1],[$a_ptr,#48] + ldp @acc[2],@acc[3],[$a_ptr,#64] + ldp @acc[4],@acc[5],[$a_ptr,#80] + + adds @mod[0],$a[0],$bi // t0 = a->re + a->im + adcs @mod[1],$a[1],@acc[1] + adcs @mod[2],$a[2],@acc[2] + adcs @mod[3],$a[3],@acc[3] + adcs @mod[4],$a[4],@acc[4] + adc @mod[5],$a[5],@acc[5] + + subs @acc[0],$a[0],$bi // t1 = a->re - a->im + sbcs @acc[1],$a[1],@acc[1] + sbcs @acc[2],$a[2],@acc[2] + sbcs @acc[3],$a[3],@acc[3] + sbcs @acc[4],$a[4],@acc[4] + sbcs @acc[5],$a[5],@acc[5] + sbc @acc[6],xzr,xzr // borrow flag as mask + + stp @mod[0],@mod[1],[sp] + stp @mod[2],@mod[3],[sp,#16] + stp @mod[4],@mod[5],[sp,#32] + stp @acc[0],@acc[1],[sp,#48] + stp @acc[2],@acc[3],[sp,#64] + stp @acc[4],@acc[5],[sp,#80] + str @acc[6],[sp,#96] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + add $b_ptr,$a_ptr,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) + + adds @acc[0],@a[0],@a[0] // add with itself + adcs @acc[1],@a[1],@a[1] + adcs @acc[2],@a[2],@a[2] + adcs @acc[3],@a[3],@a[3] + adcs @acc[4],@a[4],@a[4] + adc @acc[5],@a[5],@a[5] + + stp @acc[0],@acc[1],[$b_ptr,#48] + stp @acc[2],@acc[3],[$b_ptr,#64] + stp @acc[4],@acc[5],[$b_ptr,#80] + + ldp @a[0],@a[1],[sp] + ldr $bi,[sp,#48] + ldp @a[2],@a[3],[sp,#16] + ldp @a[4],@a[5],[sp,#32] + + add $b_ptr,sp,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) + ldr x30,[x29,#8] + + ldr @acc[6],[sp,#96] // account for sign from a->re - a->im + ldp @acc[0],@acc[1],[sp] + ldp @acc[2],@acc[3],[sp,#16] + ldp @acc[4],@acc[5],[sp,#32] + + and @acc[0],@acc[0],@acc[6] + and @acc[1],@acc[1],@acc[6] + and @acc[2],@acc[2],@acc[6] + and @acc[3],@acc[3],@acc[6] + and @acc[4],@acc[4],@acc[6] + and @acc[5],@acc[5],@acc[6] + + subs @a[0],@a[0],@acc[0] + sbcs @a[1],@a[1],@acc[1] + sbcs @a[2],@a[2],@acc[2] + sbcs @a[3],@a[3],@acc[3] + sbcs @a[4],@a[4],@acc[4] + sbcs @a[5],@a[5],@acc[5] + sbc @acc[6],xzr,xzr + + and @acc[0],@mod[0],@acc[6] + and @acc[1],@mod[1],@acc[6] + and @acc[2],@mod[2],@acc[6] + and @acc[3],@mod[3],@acc[6] + and @acc[4],@mod[4],@acc[6] + and @acc[5],@mod[5],@acc[6] + + adds @a[0],@a[0],@acc[0] + adcs @a[1],@a[1],@acc[1] + adcs @a[2],@a[2],@acc[2] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adc @a[5],@a[5],@acc[5] + + stp @a[0],@a[1],[$b_ptr] + stp @a[2],@a[3],[$b_ptr,#16] + stp @a[4],@a[5],[$b_ptr,#32] + + add sp,sp,#112 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sqr_mont_382x,.-sqr_mont_382x + +.type __mul_mont_383_nonred,%function +.align 5 +__mul_mont_383_nonred: + mul @acc[0],@a[0],$bi + mul @acc[1],@a[1],$bi + mul @acc[2],@a[2],$bi + mul @acc[3],@a[3],$bi + mul @acc[4],@a[4],$bi + mul @acc[5],@a[5],$bi + mul $n0,$n0,@acc[0] + + umulh @tmp[0],@a[0],$bi + umulh @tmp[1],@a[1],$bi + umulh @tmp[2],@a[2],$bi + umulh @tmp[3],@a[3],$bi + umulh @tmp[4],@a[4],$bi + umulh @tmp[5],@a[5],$bi + + adds @acc[1],@acc[1],@tmp[0] + mul @tmp[0],@mod[0],$n0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$n0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$n0 
+ adcs @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$n0 + adcs @acc[5],@acc[5],@tmp[4] + mul @tmp[4],@mod[4],$n0 + adc @acc[6],xzr, @tmp[5] + mul @tmp[5],@mod[5],$n0 +___ +for ($i=1;$i<6;$i++) { +$code.=<<___; + ldr $bi,[$b_ptr,8*$i] + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$n0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$n0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$n0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$n0 + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@mod[4],$n0 + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@mod[5],$n0 + adc @acc[6],@acc[6],xzr + + ldr $n0,[x29,#96] + adds @acc[0],@acc[1],@tmp[0] + mul @tmp[0],@a[0],$bi + adcs @acc[1],@acc[2],@tmp[1] + mul @tmp[1],@a[1],$bi + adcs @acc[2],@acc[3],@tmp[2] + mul @tmp[2],@a[2],$bi + adcs @acc[3],@acc[4],@tmp[3] + mul @tmp[3],@a[3],$bi + adcs @acc[4],@acc[5],@tmp[4] + mul @tmp[4],@a[4],$bi + adcs @acc[5],@acc[6],@tmp[5] + mul @tmp[5],@a[5],$bi + adc @acc[6],xzr,xzr + + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@a[0],$bi + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@a[1],$bi + adcs @acc[2],@acc[2],@tmp[2] + mul $n0,$n0,@acc[0] + umulh @tmp[2],@a[2],$bi + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@a[3],$bi + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@a[4],$bi + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@a[5],$bi + adc @acc[6],@acc[6],xzr + + adds @acc[1],@acc[1],@tmp[0] + mul @tmp[0],@mod[0],$n0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$n0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$n0 + adcs @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$n0 + adcs @acc[5],@acc[5],@tmp[4] + mul @tmp[4],@mod[4],$n0 + adc @acc[6],@acc[6],@tmp[5] + mul @tmp[5],@mod[5],$n0 +___ +} +$code.=<<___; + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$n0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$n0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$n0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$n0 + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@mod[4],$n0 + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@mod[5],$n0 + adc @acc[6],@acc[6],xzr + ldp $n0,$b_ptr,[x29,#96] // pull r_ptr + + adds @a[0],@acc[1],@tmp[0] + adcs @a[1],@acc[2],@tmp[1] + adcs @a[2],@acc[3],@tmp[2] + adcs @a[3],@acc[4],@tmp[3] + adcs @a[4],@acc[5],@tmp[4] + adcs @a[5],@acc[6],@tmp[5] + + ret +.size __mul_mont_383_nonred,.-__mul_mont_383_nonred + +.globl sgn0_pty_mont_384 +.hidden sgn0_pty_mont_384 +.type sgn0_pty_mont_384,%function +.align 5 +sgn0_pty_mont_384: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov $n0,$b_ptr + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + ldp @mod[4],@mod[5],[$a_ptr,#32] + mov $a_ptr,$r_ptr + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and $r_ptr,@a[0],#1 + adds @a[0],@a[0],@a[0] + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $bi,xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc $bi,$bi,xzr + + mvn $bi,$bi + and $bi,$bi,#2 + orr $r_ptr,$r_ptr,$bi + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 + +.globl sgn0_pty_mont_384x +.hidden sgn0_pty_mont_384x +.type sgn0_pty_mont_384x,%function +.align 5 +sgn0_pty_mont_384x: + paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov $n0,$b_ptr + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + ldp @mod[4],@mod[5],[$a_ptr,#32] + mov $a_ptr,$r_ptr + + bl __mul_by_1_mont_384 + add $a_ptr,$a_ptr,#48 + + and $b_ptr,@a[0],#1 + orr $n_ptr,@a[0],@a[1] + adds @a[0],@a[0],@a[0] + orr $n_ptr,$n_ptr,@a[2] + adcs @a[1],@a[1],@a[1] + orr $n_ptr,$n_ptr,@a[3] + adcs @a[2],@a[2],@a[2] + orr $n_ptr,$n_ptr,@a[4] + adcs @a[3],@a[3],@a[3] + orr $n_ptr,$n_ptr,@a[5] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $bi,xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc $bi,$bi,xzr + + mvn $bi,$bi + and $bi,$bi,#2 + orr $b_ptr,$b_ptr,$bi + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and $r_ptr,@a[0],#1 + orr $a_ptr,@a[0],@a[1] + adds @a[0],@a[0],@a[0] + orr $a_ptr,$a_ptr,@a[2] + adcs @a[1],@a[1],@a[1] + orr $a_ptr,$a_ptr,@a[3] + adcs @a[2],@a[2],@a[2] + orr $a_ptr,$a_ptr,@a[4] + adcs @a[3],@a[3],@a[3] + orr $a_ptr,$a_ptr,@a[5] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $bi,xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc $bi,$bi,xzr + + mvn $bi,$bi + and $bi,$bi,#2 + orr $r_ptr,$r_ptr,$bi + + cmp $n_ptr,#0 + csel $n_ptr,$r_ptr,$b_ptr,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp $a_ptr,#0 + csel $a_ptr,$r_ptr,$b_ptr,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and $n_ptr,$n_ptr,#1 + and $a_ptr,$a_ptr,#2 + orr $r_ptr,$a_ptr,$n_ptr // pack sign and parity + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x +___ + +if (0) { +my @b = ($bi, @mod[0..4]); +my @comba = @acc[4..6]; + +$code.=<<___; +.type __mul_384_comba,%function +.align 5 +__mul_384_comba: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + ldp @b[2],@b[3],[$b_ptr,#16] + ldp @b[4],@b[5],[$b_ptr,#32] + + mul @comba[0],@a[0],@b[0] + umulh @comba[1],@a[0],@b[0] + mul @acc[0],@a[1],@b[0] + umulh @acc[1],@a[1],@b[0] + str @comba[0],[$r_ptr] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[0],@b[1] + umulh @acc[3],@a[0],@b[1] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],xzr, @acc[1] + adc @comba[2],xzr,xzr + mul @acc[0],@a[2],@b[0] + umulh @acc[1],@a[2],@b[0] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#8] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[1],@b[1] + umulh @acc[3],@a[1],@b[1] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],xzr,xzr + mul @acc[0],@a[0],@b[2] + umulh @acc[1],@a[0],@b[2] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[3],@b[0] + umulh @acc[3],@a[3],@b[0] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#16] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[0],@a[2],@b[1] + umulh @acc[1],@a[2],@b[1] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],xzr,xzr + mul @acc[2],@a[1],@b[2] + umulh @acc[3],@a[1],@b[2] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[0],@b[3] + umulh @acc[1],@a[0],@b[3] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[4],@b[0] + umulh @acc[3],@a[4],@b[0] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#24] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[0],@a[3],@b[1] + umulh @acc[1],@a[3],@b[1] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],xzr,xzr + mul @acc[2],@a[2],@b[2] + umulh @acc[3],@a[2],@b[2] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[1],@b[3] + umulh @acc[1],@a[1],@b[3] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[0],@b[4] + umulh @acc[3],@a[0],@b[4] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[5],@b[0] + umulh @acc[1],@a[5],@b[0] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#32] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[4],@b[1] + umulh @acc[3],@a[4],@b[1] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],xzr,xzr + mul @acc[0],@a[3],@b[2] + umulh @acc[1],@a[3],@b[2] + adds @comba[0],@comba[0],@acc[2] + adcs 
@comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[2],@b[3] + umulh @acc[3],@a[2],@b[3] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[1],@b[4] + umulh @acc[1],@a[1],@b[4] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[0],@b[5] + umulh @acc[3],@a[0],@b[5] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[5],@b[1] + umulh @acc[1],@a[5],@b[1] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#40] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[4],@b[2] + umulh @acc[3],@a[4],@b[2] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],xzr,xzr + mul @acc[0],@a[3],@b[3] + umulh @acc[1],@a[3],@b[3] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[2],@b[4] + umulh @acc[3],@a[2],@b[4] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[1],@b[5] + umulh @acc[1],@a[1],@b[5] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[5],@b[2] + umulh @acc[3],@a[5],@b[2] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#48] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[0],@a[4],@b[3] + umulh @acc[1],@a[4],@b[3] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],xzr,xzr + mul @acc[2],@a[3],@b[4] + umulh @acc[3],@a[3],@b[4] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[2],@b[5] + umulh @acc[1],@a[2],@b[5] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[5],@b[3] + umulh @acc[3],@a[5],@b[3] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#56] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[0],@a[4],@b[4] + umulh @acc[1],@a[4],@b[4] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],xzr,xzr + mul @acc[2],@a[3],@b[5] + umulh @acc[3],@a[3],@b[5] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[5],@b[4] + umulh @acc[1],@a[5],@b[4] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#64] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[4],@b[5] + umulh @acc[3],@a[4],@b[5] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],xzr,xzr + mul @acc[0],@a[5],@b[5] + umulh @acc[1],@a[5],@b[5] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#72] +___ + push(@comba,shift(@comba)); +$code.=<<___; + adds @comba[0],@comba[0],@acc[0] + adc @comba[1],@comba[1],@acc[1] + stp @comba[0],@comba[1],[$r_ptr,#80] + + ret +.size __mul_384_comba,.-__mul_384_comba +___ +} +print $code; + +close STDOUT; diff --git a/crypto/blst_src/asm/mulq_mont_256-x86_64.pl b/crypto/blst_src/asm/mulq_mont_256-x86_64.pl new file mode 100755 index 00000000000..12e58bb001e 
--- /dev/null +++ b/crypto/blst_src/asm/mulq_mont_256-x86_64.pl @@ -0,0 +1,513 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# As for "sparse" in subroutine names, see commentary in the +# asm/mulx_mont_256-x86_64.pl module. + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +{ ############################################################## 256 bits +my @acc=map("%r$_",(9..15)); + +{ ############################################################## mulq +my ($hi, $a0) = ("%rbp", $r_ptr); + +$code.=<<___; +.text + +.globl mul_mont_sparse_256 +.hidden mul_mont_sparse_256 +.type mul_mont_sparse_256,\@function,5,"unwind" +.align 32 +mul_mont_sparse_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($b_org), %rax + mov 8*0($a_ptr), @acc[4] + mov 8*1($a_ptr), @acc[5] + mov 8*2($a_ptr), @acc[3] + mov 8*3($a_ptr), $hi + mov $b_org, $b_ptr # evacuate from %rdx + + mov %rax, @acc[6] + mulq @acc[4] # a[0]*b[0] + mov %rax, @acc[0] + mov @acc[6], %rax + mov %rdx, @acc[1] + call __mulq_mont_sparse_256 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_mont_sparse_256,.-mul_mont_sparse_256 + +.globl sqr_mont_sparse_256 +.hidden sqr_mont_sparse_256 +.type sqr_mont_sparse_256,\@function,4,"unwind" +.align 32 +sqr_mont_sparse_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), %rax + mov $n_ptr, $n0 + mov 8*1($a_ptr), @acc[5] + mov $b_org, $n_ptr + mov 8*2($a_ptr), @acc[3] + lea ($a_ptr), $b_ptr + mov 8*3($a_ptr), $hi + + mov %rax, @acc[6] + mulq %rax # a[0]*a[0] + mov %rax, @acc[0] + mov @acc[6], %rax + mov %rdx, @acc[1] + call __mulq_mont_sparse_256 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 +___ +{ +my @acc=@acc; +$code.=<<___; +.type __mulq_mont_sparse_256,\@abi-omnipotent +.align 32 +__mulq_mont_sparse_256: + mulq @acc[5] # a[1]*b[0] + add %rax, @acc[1] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq @acc[3] # 
a[2]*b[0] + add %rax, @acc[2] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[3] + + mulq $hi # a[3]*b[0] + add %rax, @acc[3] + mov 8($b_ptr), %rax + adc \$0, %rdx + xor @acc[5], @acc[5] + mov %rdx, @acc[4] + +___ +for (my $i=1; $i<4; $i++) { +my $b_next = $i<3 ? 8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + mov @acc[0], $a0 + imulq $n0, @acc[0] + + ################################# Multiply by b[$i] + mov %rax, @acc[6] + mulq 8*0($a_ptr) + add %rax, @acc[1] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*1($a_ptr) + add %rax, @acc[2] + mov @acc[6], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($a_ptr) + add %rax, @acc[3] + mov @acc[6], %rax + adc \$0, %rdx + add $hi, @acc[3] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($a_ptr) + add %rax, @acc[4] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[4] + adc %rdx, @acc[5] # can't overflow + xor @acc[6], @acc[6] + + ################################# reduction + mulq 8*0($n_ptr) + add %rax, $a0 # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, $a0 + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add $a0, @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + add %rax, @acc[3] + mov $b_next, %rax + adc \$0, %rdx + add $hi, @acc[3] + adc \$0, %rdx + add %rdx, @acc[4] + adc \$0, @acc[5] + adc \$0, @acc[6] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + imulq $n0, %rax + mov 8(%rsp), $a_ptr # restore $r_ptr + + ################################# last reduction + mov %rax, @acc[6] + mulq 8*0($n_ptr) + add %rax, @acc[0] # guaranteed to be zero + mov @acc[6], %rax + adc %rdx, @acc[0] + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[6], %rax + adc \$0, %rdx + add @acc[0], @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[6], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + mov @acc[2], $b_ptr + add $hi, @acc[3] + adc \$0, %rdx + add %rax, @acc[3] + mov @acc[1], %rax + adc \$0, %rdx + add %rdx, @acc[4] + adc \$0, @acc[5] + + ################################# + # Branch-less conditional subtraction of modulus + + mov @acc[3], @acc[0] + sub 8*0($n_ptr), @acc[1] + sbb 8*1($n_ptr), @acc[2] + sbb 8*2($n_ptr), @acc[3] + mov @acc[4], $hi + sbb 8*3($n_ptr), @acc[4] + sbb \$0, @acc[5] + + cmovc %rax, @acc[1] + cmovc $b_ptr, @acc[2] + cmovc @acc[0], @acc[3] + mov @acc[1], 8*0($a_ptr) + cmovc $hi, @acc[4] + mov @acc[2], 8*1($a_ptr) + mov @acc[3], 8*2($a_ptr) + mov @acc[4], 8*3($a_ptr) + + ret +.cfi_endproc +.size __mulq_mont_sparse_256,.-__mulq_mont_sparse_256 +___ +} } +{ my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" + +$code.=<<___; +.globl from_mont_256 +.hidden from_mont_256 +.type from_mont_256,\@function,4,"unwind" +.align 32 +from_mont_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulq_by_1_mont_256 + + ################################# + # Branch-less conditional acc[0:3] - modulus + + #mov @acc[4], %rax # __mulq_by_1_mont_256 does it + mov @acc[5], @acc[1] + mov @acc[6], @acc[2] + mov @acc[0], @acc[3] + + sub 8*0($n_ptr), @acc[4] + sbb 8*1($n_ptr), @acc[5] + sbb 8*2($n_ptr), @acc[6] + 
sbb 8*3($n_ptr), @acc[0] + + cmovnc @acc[4], %rax + cmovnc @acc[5], @acc[1] + cmovnc @acc[6], @acc[2] + mov %rax, 8*0($r_ptr) + cmovnc @acc[0], @acc[3] + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size from_mont_256,.-from_mont_256 + +.globl redc_mont_256 +.hidden redc_mont_256 +.type redc_mont_256,\@function,4,"unwind" +.align 32 +redc_mont_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulq_by_1_mont_256 + + add 8*4($a_ptr), @acc[4] # accumulate upper half + adc 8*5($a_ptr), @acc[5] + mov @acc[4], %rax + adc 8*6($a_ptr), @acc[6] + mov @acc[5], @acc[1] + adc 8*7($a_ptr), @acc[0] + sbb $a_ptr, $a_ptr + + ################################# + # Branch-less conditional acc[0:4] - modulus + + mov @acc[6], @acc[2] + sub 8*0($n_ptr), @acc[4] + sbb 8*1($n_ptr), @acc[5] + sbb 8*2($n_ptr), @acc[6] + mov @acc[0], @acc[3] + sbb 8*3($n_ptr), @acc[0] + sbb \$0, $a_ptr + + cmovnc @acc[4], %rax + cmovnc @acc[5], @acc[1] + cmovnc @acc[6], @acc[2] + mov %rax, 8*0($r_ptr) + cmovnc @acc[0], @acc[3] + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size redc_mont_256,.-redc_mont_256 +___ +{ +my @acc=@acc; + +$code.=<<___; +.type __mulq_by_1_mont_256,\@abi-omnipotent +.align 32 +__mulq_by_1_mont_256: + mov 8*0($a_ptr), %rax + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + + mov %rax, @acc[4] + imulq $n0, %rax + mov %rax, @acc[0] +___ +for (my $i=0; $i<4; $i++) { +my $hi = @acc[4]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, @acc[4] # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, @acc[4] + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[4], @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) +___ +$code.=<<___ if ($i<3); + mov @acc[1], @acc[5] + imulq $n0, @acc[1] +___ +$code.=<<___; + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + add %rax, @acc[3] + mov @acc[1], %rax + adc \$0, %rdx + add $hi, @acc[3] + adc \$0, %rdx + mov %rdx, @acc[4] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + ret +.size __mulq_by_1_mont_256,.-__mulq_by_1_mont_256 +___ +} } } + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/mulq_mont_384-x86_64.pl b/crypto/blst_src/asm/mulq_mont_384-x86_64.pl new file mode 100755 index 00000000000..3812319e8ba --- /dev/null +++ b/crypto/blst_src/asm/mulq_mont_384-x86_64.pl @@ -0,0 +1,2675 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +# common accumulator layout +@acc=map("%r$_",(8..15)); + +######################################################################## +{ my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.text + +######################################################################## +# Double-width subtraction modulo n<<384, as opposite to naively +# expected modulo n*n. It works because n<<384 is the actual +# input boundary condition for Montgomery reduction, not n*n. +# Just in case, this is duplicated, but only one module is +# supposed to be linked... +.type __sub_mod_384x384,\@abi-omnipotent +.align 32 +__sub_mod_384x384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov 8*6($a_ptr), @acc[6] + + sub 8*0($b_org), @acc[0] + mov 8*7($a_ptr), @acc[7] + sbb 8*1($b_org), @acc[1] + mov 8*8($a_ptr), @acc[8] + sbb 8*2($b_org), @acc[2] + mov 8*9($a_ptr), @acc[9] + sbb 8*3($b_org), @acc[3] + mov 8*10($a_ptr), @acc[10] + sbb 8*4($b_org), @acc[4] + mov 8*11($a_ptr), @acc[11] + sbb 8*5($b_org), @acc[5] + mov @acc[0], 8*0($r_ptr) + sbb 8*6($b_org), @acc[6] + mov 8*0($n_ptr), @acc[0] + mov @acc[1], 8*1($r_ptr) + sbb 8*7($b_org), @acc[7] + mov 8*1($n_ptr), @acc[1] + mov @acc[2], 8*2($r_ptr) + sbb 8*8($b_org), @acc[8] + mov 8*2($n_ptr), @acc[2] + mov @acc[3], 8*3($r_ptr) + sbb 8*9($b_org), @acc[9] + mov 8*3($n_ptr), @acc[3] + mov @acc[4], 8*4($r_ptr) + sbb 8*10($b_org), @acc[10] + mov 8*4($n_ptr), @acc[4] + mov @acc[5], 8*5($r_ptr) + sbb 8*11($b_org), @acc[11] + mov 8*5($n_ptr), @acc[5] + sbb $b_org, $b_org + + and $b_org, @acc[0] + and $b_org, @acc[1] + and $b_org, @acc[2] + and $b_org, @acc[3] + and $b_org, @acc[4] + and $b_org, @acc[5] + + add @acc[0], @acc[6] + adc @acc[1], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[2], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[3], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[4], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[5], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,\@abi-omnipotent +.align 32 +__add_mod_384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + add 8*0($b_org), @acc[0] + adc 8*1($b_org), @acc[1] + adc 8*2($b_org), @acc[2] + mov @acc[0], @acc[6] + adc 8*3($b_org), @acc[3] + mov @acc[1], @acc[7] + adc 8*4($b_org), @acc[4] + mov @acc[2], @acc[8] + adc 8*5($b_org), @acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, 
$b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc @acc[9], @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,\@abi-omnipotent +.align 32 +__sub_mod_384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + +__sub_mod_384_a_is_loaded: + sub 8*0($b_org), @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb 8*1($b_org), @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb 8*2($b_org), @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb 8*3($b_org), @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb 8*4($b_org), @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb 8*5($b_org), @acc[5] + mov 8*5($n_ptr), @acc[11] + sbb $b_org, $b_org + + and $b_org, @acc[6] + and $b_org, @acc[7] + and $b_org, @acc[8] + and $b_org, @acc[9] + and $b_org, @acc[10] + and $b_org, @acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[8], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[9], @acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[10], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[11], @acc[5] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __sub_mod_384,.-__sub_mod_384 +___ +} + +######################################################################## +# "Complex" multiplication and squaring. Use vanilla multiplication when +# possible to fold reductions. I.e. instead of mul_mont, mul_mont +# followed by add/sub_mod, it calls mul, mul, double-width add/sub_mod +# followed by *common* reduction... 
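+#
+# A sketch of that folding idea, in the notation of mul_mont_384x below
+# (assuming the usual Fp2 representation where the imaginary unit squares
+# to -1): with the three plain, double-width (unreduced) products
+#     t0 = a->re * b->re
+#     t1 = a->im * b->im
+#     t2 = (a->re + a->im) * (b->re + b->im)
+# the complex result is assembled as
+#     ret->re = redc(t0 - t1)
+#     ret->im = redc(t2 - t0 - t1)
+# where the subtractions are the double-width __sub_mod_384x384 above and
+# redc denotes __mulq_by_1_mont_384 followed by __redc_tail_mont_384, i.e.
+# one Montgomery reduction per output half instead of one per multiplication.
+#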
+{ my $frame = 5*8 + # place for argument off-load + + 3*768/8; # place for 3 768-bit temporary vectors +$code.=<<___; +.globl mul_mont_384x +.hidden mul_mont_384x +.type mul_mont_384x,\@function,5,"unwind" +.align 32 +mul_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $b_org, $b_ptr + mov $r_ptr, 8*4(%rsp) # offload arguments + mov $a_ptr, 8*3(%rsp) + mov $b_org, 8*2(%rsp) + mov $n_ptr, 8*1(%rsp) + mov $n0, 8*0(%rsp) + + ################################# mul_384(t0, a->re, b->re); + #lea 0($b_btr), $b_ptr # b->re + #lea 0($a_ptr), $a_ptr # a->re + lea 40(%rsp), $r_ptr # t0 + call __mulq_384 + + ################################# mul_384(t1, a->im, b->im); + lea 48($b_ptr), $b_ptr # b->im + lea 48($a_ptr), $a_ptr # a->im + lea 40+96(%rsp), $r_ptr # t1 + call __mulq_384 + + ################################# mul_384(t2, a->re+a->im, b->re+b->im); + mov 8*1(%rsp), $n_ptr + lea -48($a_ptr), $b_org + lea 40+192+48(%rsp), $r_ptr + call __add_mod_384 + + mov 8*2(%rsp), $a_ptr + lea 48($a_ptr), $b_org + lea -48($r_ptr), $r_ptr + call __add_mod_384 + + lea ($r_ptr),$b_ptr + lea 48($r_ptr),$a_ptr + call __mulq_384 + + ################################# t2=t2-t0-t1 + lea ($r_ptr), $a_ptr # t2 + lea 40(%rsp), $b_org # t0 + mov 8*1(%rsp), $n_ptr + call __sub_mod_384x384 # t2=t2-t0 + + lea ($r_ptr), $a_ptr # t2 + lea -96($r_ptr), $b_org # t1 + call __sub_mod_384x384 # t2=t2-t1 + + ################################# t0=t0-t1 + lea 40(%rsp), $a_ptr + lea 40+96(%rsp), $b_org + lea 40(%rsp), $r_ptr + call __sub_mod_384x384 # t0-t1 + + mov $n_ptr, $b_ptr # n_ptr for redc_mont_384 + + ################################# redc_mont_384(ret->re, t0, mod, n0); + lea 40(%rsp), $a_ptr # t0 + mov 8*0(%rsp), %rcx # n0 for redc_mont_384 + mov 8*4(%rsp), $r_ptr # ret->re + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + ################################# redc_mont_384(ret->im, t2, mod, n0); + lea 40+192(%rsp), $a_ptr # t2 + mov 8*0(%rsp), %rcx # n0 for redc_mont_384 + lea 48($r_ptr), $r_ptr # ret->im + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size mul_mont_384x,.-mul_mont_384x +___ +} +{ my $frame = 4*8 + # place for argument off-load + + 2*384/8 + # place for 2 384-bit temporary vectors + 8; # align +$code.=<<___; +.globl sqr_mont_384x +.hidden sqr_mont_384x +.type sqr_mont_384x,\@function,4,"unwind" +.align 32 +sqr_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $n_ptr, 8*0(%rsp) # n0 + mov $b_org, $n_ptr # n_ptr + mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 + mov $a_ptr, 8*2(%rsp) + + ################################# add_mod_384(t0, a->re, a->im); + lea 48($a_ptr), $b_org # a->im + lea 32(%rsp), $r_ptr # t0 + call __add_mod_384 + + ################################# sub_mod_384(t1, a->re, 
a->im); + mov 8*2(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_org # a->im + lea 32+48(%rsp), $r_ptr # t1 + call __sub_mod_384 + + ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); + mov 8*2(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_ptr # a->im + + mov 48($a_ptr), %rax # a->im + mov 8*0($a_ptr), @acc[6] # a->re + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[4] + mov 8*3($a_ptr), @acc[5] + + call __mulq_mont_384 +___ +{ +my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 + 12,13,"ax","bx","bp","si"); +$code.=<<___; + add @acc[0], @acc[0] # add with itself + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + mov @acc[0], @acc[6] + adc @acc[3], @acc[3] + mov @acc[1], @acc[7] + adc @acc[4], @acc[4] + mov @acc[2], @acc[8] + adc @acc[5], @acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*6($r_ptr) # ret->im + cmovc @acc[9], @acc[3] + mov @acc[1], 8*7($r_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*8($r_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*9($r_ptr) + mov @acc[4], 8*10($r_ptr) + mov @acc[5], 8*11($r_ptr) +___ +} +$code.=<<___; + ################################# mul_mont_384(ret->re, t0, t1, mod, n0); + lea 32(%rsp), $a_ptr # t0 + lea 32+48(%rsp), $b_ptr # t1 + + mov 32+48(%rsp), %rax # t1[0] + mov 32+8*0(%rsp), @acc[6] # t0[0..3] + mov 32+8*1(%rsp), @acc[7] + mov 32+8*2(%rsp), @acc[4] + mov 32+8*3(%rsp), @acc[5] + + call __mulq_mont_384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_mont_384x,.-sqr_mont_384x + +.globl mul_382x +.hidden mul_382x +.type mul_382x,\@function,4,"unwind" +.align 32 +mul_382x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 96($r_ptr), $r_ptr # ret->im + mov $a_ptr, 8*0(%rsp) + mov $b_org, 8*1(%rsp) + mov $r_ptr, 8*2(%rsp) # offload ret->im + mov $n_ptr, 8*3(%rsp) + + ################################# t0 = a->re + a->im + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + add 8*6($a_ptr), @acc[0] + adc 8*7($a_ptr), @acc[1] + adc 8*8($a_ptr), @acc[2] + adc 8*9($a_ptr), @acc[3] + adc 8*10($a_ptr), @acc[4] + adc 8*11($a_ptr), @acc[5] + + mov @acc[0], 32+8*0(%rsp) + mov @acc[1], 32+8*1(%rsp) + mov @acc[2], 32+8*2(%rsp) + mov @acc[3], 32+8*3(%rsp) + mov @acc[4], 32+8*4(%rsp) + mov @acc[5], 32+8*5(%rsp) + + ################################# t1 = b->re + b->im + mov 8*0($b_org), @acc[0] + mov 8*1($b_org), @acc[1] + mov 8*2($b_org), @acc[2] + mov 8*3($b_org), @acc[3] + mov 8*4($b_org), @acc[4] + mov 8*5($b_org), @acc[5] + + add 8*6($b_org), @acc[0] + adc 8*7($b_org), @acc[1] + adc 8*8($b_org), @acc[2] + adc 8*9($b_org), @acc[3] + adc 8*10($b_org), 
@acc[4] + adc 8*11($b_org), @acc[5] + + mov @acc[0], 32+8*6(%rsp) + mov @acc[1], 32+8*7(%rsp) + mov @acc[2], 32+8*8(%rsp) + mov @acc[3], 32+8*9(%rsp) + mov @acc[4], 32+8*10(%rsp) + mov @acc[5], 32+8*11(%rsp) + + ################################# mul_384(ret->im, t0, t1); + lea 32+8*0(%rsp), $a_ptr # t0 + lea 32+8*6(%rsp), $b_ptr # t1 + call __mulq_384 + + ################################# mul_384(ret->re, a->re, b->re); + mov 8*0(%rsp), $a_ptr + mov 8*1(%rsp), $b_ptr + lea -96($r_ptr), $r_ptr # ret->re + call __mulq_384 + + ################################# mul_384(tx, a->im, b->im); + lea 48($a_ptr), $a_ptr + lea 48($b_ptr), $b_ptr + lea 32(%rsp), $r_ptr + call __mulq_384 + + ################################# ret->im -= tx + mov 8*2(%rsp), $a_ptr # restore ret->im + lea 32(%rsp), $b_org + mov 8*3(%rsp), $n_ptr + mov $a_ptr, $r_ptr + call __sub_mod_384x384 + + ################################# ret->im -= ret->re + lea 0($r_ptr), $a_ptr + lea -96($r_ptr), $b_org + call __sub_mod_384x384 + + ################################# ret->re -= tx + lea -96($r_ptr), $a_ptr + lea 32(%rsp), $b_org + lea -96($r_ptr), $r_ptr + call __sub_mod_384x384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size mul_382x,.-mul_382x +___ +} +{ my @acc=(@acc,"%rax","%rbx","%rbp",$b_org); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.globl sqr_382x +.hidden sqr_382x +.type sqr_382x,\@function,3,"unwind" +.align 32 +sqr_382x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + + ################################# t0 = a->re + a->im + mov 8*0($a_ptr), @acc[6] + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + mov 8*4($a_ptr), @acc[10] + mov 8*5($a_ptr), @acc[11] + + mov @acc[6], @acc[0] + add 8*6($a_ptr), @acc[6] + mov @acc[7], @acc[1] + adc 8*7($a_ptr), @acc[7] + mov @acc[8], @acc[2] + adc 8*8($a_ptr), @acc[8] + mov @acc[9], @acc[3] + adc 8*9($a_ptr), @acc[9] + mov @acc[10], @acc[4] + adc 8*10($a_ptr), @acc[10] + mov @acc[11], @acc[5] + adc 8*11($a_ptr), @acc[11] + + mov @acc[6], 8*0($r_ptr) + mov @acc[7], 8*1($r_ptr) + mov @acc[8], 8*2($r_ptr) + mov @acc[9], 8*3($r_ptr) + mov @acc[10], 8*4($r_ptr) + mov @acc[11], 8*5($r_ptr) + + ################################# t1 = a->re - a->im + lea 48($a_ptr), $b_org + lea 48($r_ptr), $r_ptr + call __sub_mod_384_a_is_loaded + + ################################# mul_384(ret->re, t0, t1); + lea ($r_ptr), $a_ptr + lea -48($r_ptr), $b_ptr + lea -48($r_ptr), $r_ptr + call __mulq_384 + + ################################# mul_384(ret->im, a->re, a->im); + mov (%rsp), $a_ptr + lea 48($a_ptr), $b_ptr + lea 96($r_ptr), $r_ptr + call __mulq_384 + + mov 8*0($r_ptr), @acc[0] # double ret->im + mov 8*1($r_ptr), @acc[1] + mov 8*2($r_ptr), @acc[2] + mov 8*3($r_ptr), @acc[3] + mov 8*4($r_ptr), @acc[4] + mov 8*5($r_ptr), @acc[5] + mov 8*6($r_ptr), @acc[6] + mov 8*7($r_ptr), @acc[7] + mov 8*8($r_ptr), @acc[8] + mov 8*9($r_ptr), @acc[9] + mov 8*10($r_ptr), @acc[10] + add @acc[0], @acc[0] + mov 
8*11($r_ptr), @acc[11] + adc @acc[1], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[2], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[3], @acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[4], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[5], @acc[5] + mov @acc[4], 8*4($r_ptr) + adc @acc[6], @acc[6] + mov @acc[5], 8*5($r_ptr) + adc @acc[7], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[8], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[9], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[10], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[11], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + mov 8*1(%rsp),%r15 +.cfi_restore %r15 + mov 8*2(%rsp),%r14 +.cfi_restore %r14 + mov 8*3(%rsp),%r13 +.cfi_restore %r13 + mov 8*4(%rsp),%r12 +.cfi_restore %r12 + mov 8*5(%rsp),%rbx +.cfi_restore %rbx + mov 8*6(%rsp),%rbp +.cfi_restore %rbp + lea 8*7(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_382x,.-sqr_382x +___ +} +{ ########################################################## 384-bit mul +my @acc=map("%r$_",("cx",8..12)); +my $bi = "%rbp"; + +$code.=<<___; +.globl mul_384 +.hidden mul_384 +.type mul_384,\@function,3,"unwind" +.align 32 +mul_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 +.cfi_end_prologue + + mov $b_org, $b_ptr + call __mulq_384 + + mov 0(%rsp),%r12 +.cfi_restore %r12 + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size mul_384,.-mul_384 + +.type __mulq_384,\@abi-omnipotent +.align 32 +__mulq_384: + mov 8*0($b_ptr), %rax + + mov %rax, $bi + mulq 8*0($a_ptr) + mov %rax, 8*0($r_ptr) + mov $bi, %rax + mov %rdx, @acc[0] + + mulq 8*1($a_ptr) + add %rax, @acc[0] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[1] + + mulq 8*2($a_ptr) + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq 8*3($a_ptr) + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[3] + + mulq 8*4($a_ptr) + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq 8*5($a_ptr) + add %rax, @acc[4] + mov 8*1($b_ptr), %rax + adc \$0, %rdx + mov %rdx, @acc[5] +___ +for(my $i=1; $i<6; $i++) { +my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : "%rax"; +$code.=<<___; + mov %rax, $bi + mulq 8*0($a_ptr) + add %rax, @acc[0] + mov $bi, %rax + adc \$0, %rdx + mov @acc[0], 8*$i($r_ptr) + mov %rdx, @acc[0] + + mulq 8*1($a_ptr) + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + add @acc[1], @acc[0] + adc \$0, %rdx + mov %rdx, @acc[1] + + mulq 8*2($a_ptr) + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + add @acc[2], @acc[1] + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq 8*3($a_ptr) + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + add @acc[3], @acc[2] + adc \$0, %rdx + mov %rdx, @acc[3] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + add @acc[4], @acc[3] + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq 8*5($a_ptr) + add %rax, @acc[5] + mov $b_next, %rax + adc \$0, %rdx + add @acc[5], @acc[4] + adc \$0, %rdx + mov %rdx, @acc[5] +___ +} +$code.=<<___; + mov @acc[0], 8*6($r_ptr) + mov @acc[1], 8*7($r_ptr) + mov @acc[2], 8*8($r_ptr) + mov @acc[3], 8*9($r_ptr) + mov @acc[4], 8*10($r_ptr) + mov @acc[5], 8*11($r_ptr) + + ret +.size __mulq_384,.-__mulq_384 +___ +} +if (0) { ############################################################## +my @b=map("%r$_",(10..15)); +my @a=reverse(@b); + @b[5]=$b_ptr; +my $bi = "%rbp"; +my @comba=map("%r$_",("cx",8,9)); +# a[0]*b[0] +# a[1]*b[0] +# a[0]*b[1] +# a[2]*b[0] +# a[1]*b[1] +# a[0]*b[2] +# a[3]*b[0] +# a[2]*b[1] +# a[1]*b[2] +# a[0]*b[3] +# a[4]*b[0] +# a[3]*b[1] +# a[2]*b[2] +# a[1]*b[3] +# a[0]*b[4] +# a[5]*b[0] +# a[4]*b[1] +# a[3]*b[2] +# a[2]*b[3] +# a[1]*b[4] +# a[0]*b[5] +# a[5]*b[1] +# a[4]*b[2] +# a[3]*b[3] +# a[2]*b[4] +# a[1]*b[5] +# a[5]*b[2] +# a[4]*b[3] +# a[3]*b[4] +# a[2]*b[5] +# a[5]*b[3] +# a[4]*b[4] +# a[3]*b[5] +# a[5]*b[4] +# a[4]*b[5] +# a[5]*b[5] +# +# 13% less instructions give +15% on Core2, +10% on Goldmont, +# -0% on Sandy Bridge, but -16% on Haswell:-( +# [for reference +5% on Skylake, +11% on Ryzen] + +$code.=<<___; +.type __mulq_comba_384,\@abi-omnipotent +.align 32 +__mulq_comba_384: + mov 8*0($b_ptr), %rax + mov 8*0($a_ptr), @a[0] + mov 8*1($a_ptr), @a[1] + mov 8*1($b_ptr), @b[1] + + mov %rax, @b[0] + mulq @a[0] # a[0]*b[0] + mov %rax, 8*0($r_ptr) + mov @b[0], %rax + mov %rdx, @comba[0] + + ################################# + mov 8*2($a_ptr), @a[2] + xor @comba[2], @comba[2] + mulq @a[1] # a[1]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc \$0, %rdx + mov 8*2($b_ptr), @b[2] + mov %rdx, @comba[1] + + mulq @a[0] # a[0]*b[1] + add %rax, @comba[0] + mov @b[0], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 8*1($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[2] # a[2]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[1] # a[1]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[0] # a[0]*b[2] + add %rax, @comba[0] + mov @b[0], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 8*2($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq 8*3($a_ptr) # a[3]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[2] # a[2]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[1] # a[1]*b[2] + add %rax, @comba[0] + mov 8*3($b_ptr), %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mov %rax, @b[3] + mulq @a[0] # a[0]*b[3] + add %rax, @comba[0] + mov @b[0], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 
8*3($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq 8*4($a_ptr) # a[4]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*3($a_ptr) # a[3]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*2($a_ptr) # a[2]*b[2] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[1] # a[1]*b[3] + add %rax, @comba[0] + mov 8*4($b_ptr), %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mov %rax, @b[4] + mulq @a[0] # a[0]*b[4] + add %rax, @comba[0] + mov @b[0], %rax + adc %rdx, @comba[1] + mov 8*5($a_ptr), @a[5] + adc \$0, @comba[2] + mov @comba[0], 8*4($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*4($a_ptr) # a[4]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*3($a_ptr) # a[3]*b[2] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*2($a_ptr) # a[2]*b[3] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*1($a_ptr) # a[1]*b[4] + add %rax, @comba[0] + mov 8*5($b_ptr), %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mov %rax, @b[5] + mulq @a[0] # a[0]*b[5] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + mov 8*4($a_ptr), @a[4] + adc \$0, @comba[2] + mov @comba[0], 8*5($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[4] # a[4]*b[2] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*3($a_ptr) # a[3]*b[3] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*2($a_ptr) # a[2]*b[4] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*1($a_ptr) # a[1]*b[5] + add %rax, @comba[0] + mov $b[2], %rax + adc %rdx, @comba[1] + mov 8*3($a_ptr), @a[3] + adc \$0, @comba[2] + mov @comba[0], 8*6($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[2] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[4] # a[4]*b[3] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[3] # a[3]*b[4] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*2($a_ptr) # a[2]*b[5] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 8*7($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[3] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[4] # a[4]*b[4] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[3] # a[3]*b[5] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 8*8($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[4] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[4] # a[4]*b[5] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov 
@comba[0], 8*9($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + mulq @a[5] # a[5]*b[4] + add %rax, @comba[0] + adc %rdx, @comba[1] + + mov @comba[0], 8*10($r_ptr) + mov @comba[1], 8*11($r_ptr) + + ret +.size __mulq_comba_384,.-__mulq_comba_384 +___ +} +{ ########################################################## 384-bit sqr +my @acc=(@acc,"%rcx","%rbx","%rbp",$a_ptr); +my $hi; + +$code.=<<___; +.globl sqr_384 +.hidden sqr_384 +.type sqr_384,\@function,2,"unwind" +.align 32 +sqr_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __sqrq_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_384,.-sqr_384 + +.type __sqrq_384,\@abi-omnipotent +.align 32 +__sqrq_384: + mov 8*0($a_ptr), %rax + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + + ######################################### + mov %rax, @acc[6] + mulq @acc[7] # a[1]*a[0] + mov %rax, @acc[1] + mov @acc[6], %rax + mov 8*4($a_ptr), @acc[10] + mov %rdx, @acc[2] + + mulq @acc[8] # a[2]*a[0] + add %rax, @acc[2] + mov @acc[6], %rax + adc \$0, %rdx + mov 8*5($a_ptr), @acc[11] + mov %rdx, @acc[3] + + mulq @acc[9] # a[3]*a[0] + add %rax, @acc[3] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq @acc[10] # a[4]*a[0] + add %rax, @acc[4] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[5] + + mulq @acc[11] # a[5]*a[0] + add %rax, @acc[5] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq %rax # a[0]*a[0] + xor @acc[0], @acc[0] + mov %rax, 8*0($r_ptr) + mov @acc[7], %rax + add @acc[1], @acc[1] # double acc[1] + adc \$0, @acc[0] + add %rdx, @acc[1] # accumulate a[0]*a[0] + adc \$0, @acc[0] # carries to a[1]*a[1] + mov @acc[1], 8*1($r_ptr) +___ +$hi=@acc[1]; +$code.=<<___; + ######################################### + mulq @acc[8] # a[2]*a[1] + add %rax, @acc[3] + mov @acc[7], %rax + adc \$0, %rdx + mov %rdx, $hi + + mulq @acc[9] # a[3]*a[1] + add %rax, @acc[4] + mov @acc[7], %rax + adc \$0, %rdx + add $hi, @acc[4] + adc \$0, %rdx + mov %rdx, $hi + + mulq @acc[10] # a[4]*a[1] + add %rax, @acc[5] + mov @acc[7], %rax + adc \$0, %rdx + add $hi, @acc[5] + adc \$0, %rdx + mov %rdx, $hi + + mulq @acc[11] # a[5]*a[1] + add %rax, @acc[6] + mov @acc[7], %rax + adc \$0, %rdx + add $hi, @acc[6] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq %rax # a[1]*a[1] + xor @acc[1], @acc[1] + add %rax, @acc[0] # can't carry + mov @acc[8], %rax + add @acc[2], @acc[2] # double acc[2:3] + adc @acc[3], @acc[3] + adc \$0, @acc[1] + add @acc[0], @acc[2] # accumulate a[1]*a[1] + adc %rdx, @acc[3] + adc \$0, @acc[1] # carries to a[2]*a[2] + mov @acc[2], 8*2($r_ptr) +___ +$hi=@acc[0]; +$code.=<<___; + ######################################### + mulq @acc[9] # a[3]*a[2] + add %rax, @acc[5] + mov @acc[8], %rax + adc \$0, %rdx + mov @acc[3], 8*3($r_ptr) + mov %rdx, $hi + + mulq @acc[10] # a[4]*a[2] + add %rax, @acc[6] + mov @acc[8], %rax + adc \$0, %rdx + add $hi, @acc[6] + adc \$0, %rdx + mov %rdx, $hi + + mulq @acc[11] # a[5]*a[2] + add %rax, @acc[7] + mov @acc[8], %rax + adc \$0, %rdx + add $hi, @acc[7] + adc \$0, %rdx + mov %rdx, 
@acc[8] + + mulq %rax # a[2]*a[2] + xor @acc[3], @acc[3] + add %rax, @acc[1] # can't carry + mov @acc[9], %rax + add @acc[4], @acc[4] # double acc[4:5] + adc @acc[5], @acc[5] + adc \$0, @acc[3] + add @acc[1], @acc[4] # accumulate a[2]*a[2] + adc %rdx, @acc[5] + adc \$0, @acc[3] # carries to a[3]*a[3] + mov @acc[4], 8*4($r_ptr) + + ######################################### + mulq @acc[10] # a[4]*a[3] + add %rax, @acc[7] + mov @acc[9], %rax + adc \$0, %rdx + mov @acc[5], 8*5($r_ptr) + mov %rdx, $hi + + mulq @acc[11] # a[5]*a[3] + add %rax, @acc[8] + mov @acc[9], %rax + adc \$0, %rdx + add $hi, @acc[8] + adc \$0, %rdx + mov %rdx, @acc[9] + + mulq %rax # a[3]*a[3] + xor @acc[4], @acc[4] + add %rax, @acc[3] # can't carry + mov @acc[10], %rax + add @acc[6], @acc[6] # double acc[6:7] + adc @acc[7], @acc[7] + adc \$0, @acc[4] + add @acc[3], @acc[6] # accumulate a[3]*a[3] + adc %rdx, @acc[7] + mov @acc[6], 8*6($r_ptr) + adc \$0, @acc[4] # carries to a[4]*a[4] + mov @acc[7], 8*7($r_ptr) + + ######################################### + mulq @acc[11] # a[5]*a[4] + add %rax, @acc[9] + mov @acc[10], %rax + adc \$0, %rdx + mov %rdx, @acc[10] + + mulq %rax # a[4]*a[4] + xor @acc[5], @acc[5] + add %rax, @acc[4] # can't carry + mov @acc[11], %rax + add @acc[8], @acc[8] # double acc[8:9] + adc @acc[9], @acc[9] + adc \$0, @acc[5] + add @acc[4], @acc[8] # accumulate a[4]*a[4] + adc %rdx, @acc[9] + mov @acc[8], 8*8($r_ptr) + adc \$0, @acc[5] # carries to a[5]*a[5] + mov @acc[9], 8*9($r_ptr) + + ######################################### + mulq %rax # a[5]*a[5] + add @acc[5], %rax # can't carry + add @acc[10], @acc[10] # double acc[10] + adc \$0, %rdx + add @acc[10], %rax # accumulate a[5]*a[5] + adc \$0, %rdx + mov %rax, 8*10($r_ptr) + mov %rdx, 8*11($r_ptr) + + ret +.size __sqrq_384,.-__sqrq_384 + +.globl sqr_mont_384 +.hidden sqr_mont_384 +.type sqr_mont_384,\@function,4,"unwind" +.align 32 +sqr_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8*15, %rsp +.cfi_adjust_cfa_offset 8*15 +.cfi_end_prologue + + mov $n_ptr, 8*12(%rsp) # n0 + mov $b_org, 8*13(%rsp) # n_ptr + mov $r_ptr, 8*14(%rsp) + + mov %rsp, $r_ptr + call __sqrq_384 + + lea 0(%rsp), $a_ptr + mov 8*12(%rsp), %rcx # n0 for mul_by_1 + mov 8*13(%rsp), $b_ptr # n_ptr for mul_by_1 + mov 8*14(%rsp), $r_ptr + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + lea 8*15(%rsp), %r8 # size optimization + mov 8*15(%rsp), %r15 +.cfi_restore %r15 + mov 8*1(%r8), %r14 +.cfi_restore %r14 + mov 8*2(%r8), %r13 +.cfi_restore %r13 + mov 8*3(%r8), %r12 +.cfi_restore %r12 + mov 8*4(%r8), %rbx +.cfi_restore %rbx + mov 8*5(%r8), %rbp +.cfi_restore %rbp + lea 8*6(%r8), %rsp +.cfi_adjust_cfa_offset -8*21 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_mont_384,.-sqr_mont_384 +___ +} +{ ########################################################## 384-bit redc_mont +my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" + +$code.=<<___; +######################################################################## +# void redc_mont_384(uint64_t ret[6], const uint64_t a[12], +# uint64_t m[6], uint64_t n0); +.globl redc_mont_384 +.hidden redc_mont_384 +.type redc_mont_384,\@function,4,"unwind" +.align 32 +redc_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp 
+.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size redc_mont_384,.-redc_mont_384 + +######################################################################## +# void from_mont_384(uint64_t ret[6], const uint64_t a[6], +# uint64_t m[6], uint64_t n0); +.globl from_mont_384 +.hidden from_mont_384 +.type from_mont_384,\@function,4,"unwind" +.align 32 +from_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulq_by_1_mont_384 + + ################################# + # Branch-less conditional acc[0:6] - modulus + + #mov @acc[6], %rax # __mulq_by_1_mont_384 does it + mov @acc[7], %rcx + mov @acc[0], %rdx + mov @acc[1], %rbp + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[7] + mov @acc[2], @acc[5] + sbb 8*2($n_ptr), @acc[0] + sbb 8*3($n_ptr), @acc[1] + sbb 8*4($n_ptr), @acc[2] + mov @acc[3], $a_ptr + sbb 8*5($n_ptr), @acc[3] + + cmovc %rax, @acc[6] + cmovc %rcx, @acc[7] + cmovc %rdx, @acc[0] + mov @acc[6], 8*0($r_ptr) + cmovc %rbp, @acc[1] + mov @acc[7], 8*1($r_ptr) + cmovc @acc[5], @acc[2] + mov @acc[0], 8*2($r_ptr) + cmovc $a_ptr, @acc[3] + mov @acc[1], 8*3($r_ptr) + mov @acc[2], 8*4($r_ptr) + mov @acc[3], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size from_mont_384,.-from_mont_384 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulq_by_1_mont_384,\@abi-omnipotent +.align 32 +__mulq_by_1_mont_384: + mov 8*0($a_ptr), %rax + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov %rax, @acc[6] + imulq $n0, %rax + mov %rax, @acc[0] +___ +for (my $i=0; $i<6; $i++) { +my $hi = @acc[6]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, @acc[6] # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, @acc[6] + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[6], @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + add %rax, @acc[3] + mov @acc[0], %rax + adc \$0, %rdx +___ +$code.=<<___ if ($i<5); + mov @acc[1], @acc[7] + imulq $n0, @acc[1] +___ +$code.=<<___; + add $hi, @acc[3] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*4($n_ptr) + add %rax, @acc[4] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[4] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*5($n_ptr) + add %rax, @acc[5] + mov @acc[1], %rax + adc \$0, %rdx + add $hi, @acc[5] + adc \$0, %rdx + mov %rdx, @acc[6] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + ret +.size __mulq_by_1_mont_384,.-__mulq_by_1_mont_384 + 
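+# __redc_tail_mont_384 completes the reduction started by __mulq_by_1_mont_384:
+# it accumulates the upper six limbs of the 768-bit intermediate into acc[0:5],
+# then performs a branch-less conditional subtraction of the modulus (sbb/cmovc)
+# before storing the 384-bit result to $r_ptr.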
+.type __redc_tail_mont_384,\@abi-omnipotent +.align 32 +__redc_tail_mont_384: + add 8*6($a_ptr), @acc[0] # accumulate upper half + mov @acc[0], %rax + adc 8*7($a_ptr), @acc[1] + adc 8*8($a_ptr), @acc[2] + adc 8*9($a_ptr), @acc[3] + mov @acc[1], %rcx + adc 8*10($a_ptr), @acc[4] + adc 8*11($a_ptr), @acc[5] + sbb @acc[6], @acc[6] + + ################################# + # Branch-less conditional acc[0:6] - modulus + + mov @acc[2], %rdx + mov @acc[3], %rbp + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[7] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], $a_ptr + sbb 8*5($n_ptr), @acc[5] + sbb \$0, @acc[6] + + cmovc %rax, @acc[0] + cmovc %rcx, @acc[1] + cmovc %rdx, @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc %rbp, @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[7], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc $a_ptr, @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl sgn0_pty_mont_384 +.hidden sgn0_pty_mont_384 +.type sgn0_pty_mont_384,\@function,3,"unwind" +.align 32 +sgn0_pty_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $n_ptr + lea 0($r_ptr), $a_ptr + mov $b_org, $n0 + call __mulq_by_1_mont_384 + + xor %rax, %rax + mov @acc[0], @acc[7] + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + not %rax # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 + +.globl sgn0_pty_mont_384x +.hidden sgn0_pty_mont_384x +.type sgn0_pty_mont_384x,\@function,3,"unwind" +.align 32 +sgn0_pty_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $n_ptr + lea 48($r_ptr), $a_ptr # sgn0(a->im) + mov $b_org, $n0 + call __mulq_by_1_mont_384 + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + lea 0($r_ptr), $a_ptr # sgn0(a->re) + xor $r_ptr, $r_ptr + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, $r_ptr + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + mov @acc[0], 0(%rsp) # a->im is zero or not + not $r_ptr # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, 
$r_ptr + or @acc[7], $r_ptr # pack sign and parity + + call __mulq_by_1_mont_384 + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + xor %rax, %rax + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + mov 0(%rsp), @acc[6] + + not %rax # 2*x > p, which means "negative" + + test @acc[0], @acc[0] + cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) + + test @acc[6], @acc[6] + cmovnz $r_ptr, %rax # a->im!=0? sgn0(a->im) : sgn0(a->re) + + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x +___ +} } + +{ ########################################################## mulq_mont +my ($bi, $hi) = ("%rdi", "%rbp"); + +$code.=<<___; +.globl mul_mont_384 +.hidden mul_mont_384 +.type mul_mont_384,\@function,5,"unwind" +.align 32 +mul_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8*3, %rsp +.cfi_adjust_cfa_offset 8*3 +.cfi_end_prologue + + mov 8*0($b_org), %rax + mov 8*0($a_ptr), @acc[6] + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[4] + mov 8*3($a_ptr), @acc[5] + mov $b_org, $b_ptr # evacuate from %rdx + mov $n0, 8*0(%rsp) + mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 + + call __mulq_mont_384 + + mov 24(%rsp),%r15 +.cfi_restore %r15 + mov 32(%rsp),%r14 +.cfi_restore %r14 + mov 40(%rsp),%r13 +.cfi_restore %r13 + mov 48(%rsp),%r12 +.cfi_restore %r12 + mov 56(%rsp),%rbx +.cfi_restore %rbx + mov 64(%rsp),%rbp +.cfi_restore %rbp + lea 72(%rsp),%rsp +.cfi_adjust_cfa_offset -72 +.cfi_epilogue + ret +.cfi_endproc +.size mul_mont_384,.-mul_mont_384 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulq_mont_384,\@abi-omnipotent +.align 32 +__mulq_mont_384: + mov %rax, $bi + mulq @acc[6] # a[0]*b[0] + mov %rax, @acc[0] + mov $bi, %rax + mov %rdx, @acc[1] + + mulq @acc[7] # a[1]*b[0] + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq @acc[4] # a[2]*b[0] + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[3] + + mov @acc[0], $hi + imulq 8(%rsp), @acc[0] + + mulq @acc[5] # a[3]*b[0] + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[5] + + mulq 8*5($a_ptr) + add %rax, @acc[5] + mov @acc[0], %rax + adc \$0, %rdx + xor @acc[7], @acc[7] + mov %rdx, @acc[6] +___ +for (my $i=0; $i<6;) { +my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, $hi # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, $hi + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + add $hi, @acc[3] + adc \$0, %rdx + add %rax, @acc[3] + mov @acc[0], %rax + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*4($n_ptr) + add %rax, @acc[4] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[4] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*5($n_ptr) + add %rax, @acc[5] + mov $b_next, %rax + adc \$0, %rdx + add $hi, @acc[5] + adc %rdx, @acc[6] + adc \$0, @acc[7] +___ + push(@acc,shift(@acc)); +$code.=<<___ if ($i++<5); + ################################# Multiply by b[$i] + mov %rax, $bi + mulq 8*0($a_ptr) + add %rax, @acc[0] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*1($a_ptr) + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + add @acc[7], @acc[1] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*2($a_ptr) + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + add @acc[7], @acc[2] + adc \$0, %rdx + mov %rdx, @acc[7] + + mov @acc[0], $hi + imulq 8(%rsp), @acc[0] + + mulq 8*3($a_ptr) + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + add @acc[7], @acc[3] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + add @acc[7], @acc[4] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*5($a_ptr) + add @acc[7], @acc[5] + adc \$0, %rdx + xor @acc[7], @acc[7] + add %rax, @acc[5] + mov @acc[0], %rax + adc %rdx, @acc[6] + adc \$0, @acc[7] +___ +} +$code.=<<___; + ################################# + # Branch-less conditional acc[0:6] - modulus + + #mov @acc[0], %rax + mov 8*2(%rsp), $r_ptr # restore $r_ptr + sub 8*0($n_ptr), @acc[0] + mov @acc[1], %rdx + sbb 8*1($n_ptr), @acc[1] + mov @acc[2], $b_ptr + sbb 8*2($n_ptr), @acc[2] + mov @acc[3], $a_ptr + sbb 8*3($n_ptr), @acc[3] + mov @acc[4], $hi + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[7] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, @acc[6] + + cmovc %rax, @acc[0] + cmovc %rdx, @acc[1] + cmovc $b_ptr, @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc $a_ptr, @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc $hi, @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc @acc[7], @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __mulq_mont_384,.-__mulq_mont_384 +___ +} } +$code.=<<___; +.globl sqr_n_mul_mont_384 +.hidden sqr_n_mul_mont_384 +.type sqr_n_mul_mont_384,\@function,6,"unwind" +.align 32 +sqr_n_mul_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8*17, %rsp +.cfi_adjust_cfa_offset 8*17 +.cfi_end_prologue + + mov $n0, 8*0(%rsp) + mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 + mov $n_ptr, 8*2(%rsp) + lea 8*4(%rsp), $r_ptr + mov %r9, 8*3(%rsp) # 6th, multiplicand argument + movq (%r9), %xmm2 # prefetch b[0] + +.Loop_sqr_384: + movd %edx, %xmm1 # loop counter + + call __sqrq_384 + + lea 0($r_ptr), $a_ptr + mov 8*0(%rsp), %rcx # n0 for mul_by_1 + mov 8*2(%rsp), $b_ptr # n_ptr for mul_by_1 + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movd %xmm1, %edx + lea 0($r_ptr), $a_ptr + dec %edx + jnz .Loop_sqr_384 + + movq %xmm2, %rax # 
b[0] + mov $b_ptr, $n_ptr + mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument + + #mov 8*0($b_ptr), %rax + #mov 8*0($a_ptr), @acc[6] + #mov 8*1($a_ptr), @acc[7] + #mov 8*2($a_ptr), @acc[4] + #mov 8*3($a_ptr), @acc[5] + mov @acc[0], @acc[4] + mov @acc[1], @acc[5] + + call __mulq_mont_384 + + lea 8*17(%rsp), %r8 # size optimization + mov 8*17(%rsp), %r15 +.cfi_restore %r15 + mov 8*1(%r8), %r14 +.cfi_restore %r14 + mov 8*2(%r8), %r13 +.cfi_restore %r13 + mov 8*3(%r8), %r12 +.cfi_restore %r12 + mov 8*4(%r8), %rbx +.cfi_restore %rbx + mov 8*5(%r8), %rbp +.cfi_restore %rbp + lea 8*6(%r8), %rsp +.cfi_adjust_cfa_offset -8*23 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_n_mul_mont_384,.-sqr_n_mul_mont_384 + +.globl sqr_n_mul_mont_383 +.hidden sqr_n_mul_mont_383 +.type sqr_n_mul_mont_383,\@function,6,"unwind" +.align 32 +sqr_n_mul_mont_383: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8*17, %rsp +.cfi_adjust_cfa_offset 8*17 +.cfi_end_prologue + + mov $n0, 8*0(%rsp) + mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 + mov $n_ptr, 8*2(%rsp) + lea 8*4(%rsp), $r_ptr + mov %r9, 8*3(%rsp) # 6th, multiplicand argument + movq (%r9), %xmm2 # prefetch b[0] + +.Loop_sqr_383: + movd %edx, %xmm1 # loop counter + + call __sqrq_384 + + lea 0($r_ptr), $a_ptr + mov 8*0(%rsp), %rcx # n0 for mul_by_1 + mov 8*2(%rsp), $b_ptr # n_ptr for mul_by_1 + call __mulq_by_1_mont_384 + + movd %xmm1, %edx # loop counter + add 8*6($a_ptr), @acc[6] # just accumulate upper half + adc 8*7($a_ptr), @acc[7] + adc 8*8($a_ptr), @acc[0] + adc 8*9($a_ptr), @acc[1] + adc 8*10($a_ptr), @acc[2] + adc 8*11($a_ptr), @acc[3] + lea 0($r_ptr), $a_ptr + + mov @acc[6], 8*0($r_ptr) # omitting full reduction gives ~5% + mov @acc[7], 8*1($r_ptr) # in addition-chains + mov @acc[0], 8*2($r_ptr) + mov @acc[1], 8*3($r_ptr) + mov @acc[2], 8*4($r_ptr) + mov @acc[3], 8*5($r_ptr) + + dec %edx + jnz .Loop_sqr_383 + + movq %xmm2, %rax # b[0] + mov $b_ptr, $n_ptr + mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument + + #movq 8*0($b_ptr), %rax + #mov 8*0($a_ptr), @acc[6] + #mov 8*1($a_ptr), @acc[7] + #mov 8*2($a_ptr), @acc[4] + #mov 8*3($a_ptr), @acc[5] + mov @acc[0], @acc[4] + mov @acc[1], @acc[5] + + call __mulq_mont_384 # formally one can omit full reduction + # even after multiplication... 
+ lea 8*17(%rsp), %r8 # size optimization + mov 8*17(%rsp), %r15 +.cfi_restore %r15 + mov 8*1(%r8), %r14 +.cfi_restore %r14 + mov 8*2(%r8), %r13 +.cfi_restore %r13 + mov 8*3(%r8), %r12 +.cfi_restore %r12 + mov 8*4(%r8), %rbx +.cfi_restore %rbx + mov 8*5(%r8), %rbp +.cfi_restore %rbp + lea 8*6(%r8), %rsp +.cfi_adjust_cfa_offset -8*23 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 +___ +{ my @acc=@acc; # will be rotated locally + my $bi = "%rbp"; + +$code.=<<___; +.type __mulq_mont_383_nonred,\@abi-omnipotent +.align 32 +__mulq_mont_383_nonred: + mov %rax, $bi + mulq @acc[6] # a[0]*b[0] + mov %rax, @acc[0] + mov $bi, %rax + mov %rdx, @acc[1] + + mulq @acc[7] # a[1]*b[0] + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq @acc[4] # a[2]*b[0] + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[3] + + mov @acc[0], @acc[7] + imulq 8(%rsp), @acc[0] + + mulq @acc[5] # a[3]*b[0] + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[5] + + mulq 8*5($a_ptr) + add %rax, @acc[5] + mov @acc[0], %rax + adc \$0, %rdx + mov %rdx, @acc[6] +___ +for (my $i=0; $i<6;) { +my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, @acc[7] # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, @acc[7] + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[7], @acc[1] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[7], @acc[2] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*3($n_ptr) + add @acc[7], @acc[3] + adc \$0, %rdx + add %rax, @acc[3] + mov @acc[0], %rax + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*4($n_ptr) + add %rax, @acc[4] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[7], @acc[4] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*5($n_ptr) + add %rax, @acc[5] + mov $b_next, %rax + adc \$0, %rdx + add @acc[7], @acc[5] + adc %rdx, @acc[6] +___ + push(@acc,shift(@acc)); +$code.=<<___ if ($i++<5); + ################################# Multiply by b[$i] + mov %rax, $bi + mulq 8*0($a_ptr) + add %rax, @acc[0] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq 8*1($a_ptr) + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + add @acc[6], @acc[1] + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq 8*2($a_ptr) + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + add @acc[6], @acc[2] + adc \$0, %rdx + mov %rdx, @acc[6] + + mov @acc[0], @acc[7] + imulq 8(%rsp), @acc[0] + + mulq 8*3($a_ptr) + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + add @acc[6], @acc[3] + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + add @acc[6], @acc[4] + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq 8*5($a_ptr) + add @acc[6], @acc[5] + adc \$0, %rdx + add %rax, @acc[5] + mov @acc[0], %rax + adc \$0, %rdx + mov %rdx, @acc[6] +___ +} +$code.=<<___; + ret +.size __mulq_mont_383_nonred,.-__mulq_mont_383_nonred +___ +} +{ my $frame = 4*8 + # place for argument off-load + + 2*384/8 + # place for 2 384-bit temporary vectors + 8; # align +my @acc = (@acc,"%rax","%rdx","%rbx","%rbp"); + +# omitting 3 reductions gives 8-11% better performance in add-chains +$code.=<<___; +.globl sqr_mont_382x +.hidden sqr_mont_382x +.type sqr_mont_382x,\@function,4,"unwind" +.align 32 +sqr_mont_382x: +.cfi_startproc + push 
%rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $n_ptr, 8*0(%rsp) # n0 + mov $b_org, $n_ptr # n_ptr + mov $a_ptr, 8*2(%rsp) + mov $r_ptr, 8*3(%rsp) + + ################################# + mov 8*0($a_ptr), @acc[0] # a->re + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov @acc[0], @acc[6] + add 8*6($a_ptr), @acc[0] # a->re + a->im + mov @acc[1], @acc[7] + adc 8*7($a_ptr), @acc[1] + mov @acc[2], @acc[8] + adc 8*8($a_ptr), @acc[2] + mov @acc[3], @acc[9] + adc 8*9($a_ptr), @acc[3] + mov @acc[4], @acc[10] + adc 8*10($a_ptr), @acc[4] + mov @acc[5], @acc[11] + adc 8*11($a_ptr), @acc[5] + + sub 8*6($a_ptr), @acc[6] # a->re - a->im + sbb 8*7($a_ptr), @acc[7] + sbb 8*8($a_ptr), @acc[8] + sbb 8*9($a_ptr), @acc[9] + sbb 8*10($a_ptr), @acc[10] + sbb 8*11($a_ptr), @acc[11] + sbb $r_ptr, $r_ptr # borrow flag as mask + + mov @acc[0], 32+8*0(%rsp) # t0 + mov @acc[1], 32+8*1(%rsp) + mov @acc[2], 32+8*2(%rsp) + mov @acc[3], 32+8*3(%rsp) + mov @acc[4], 32+8*4(%rsp) + mov @acc[5], 32+8*5(%rsp) + + mov @acc[6], 32+8*6(%rsp) # t1 + mov @acc[7], 32+8*7(%rsp) + mov @acc[8], 32+8*8(%rsp) + mov @acc[9], 32+8*9(%rsp) + mov @acc[10], 32+8*10(%rsp) + mov @acc[11], 32+8*11(%rsp) + mov $r_ptr, 32+8*12(%rsp) + + ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); + #mov 8*2(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_ptr # a->im + + mov 48($a_ptr), %rax # a->im + mov 8*0($a_ptr), @acc[6] # a->re + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[4] + mov 8*3($a_ptr), @acc[5] + + mov 8*3(%rsp), $r_ptr + call __mulq_mont_383_nonred +___ +{ +my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 + 12,13,"ax","bx","bp","si"); +$code.=<<___; + add @acc[0], @acc[0] # add with itself + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + + mov @acc[0], 8*6($r_ptr) # ret->im + mov @acc[1], 8*7($r_ptr) + mov @acc[2], 8*8($r_ptr) + mov @acc[3], 8*9($r_ptr) + mov @acc[4], 8*10($r_ptr) + mov @acc[5], 8*11($r_ptr) +___ +} +$code.=<<___; + ################################# mul_mont_384(ret->re, t0, t1, mod, n0); + lea 32(%rsp), $a_ptr # t0 + lea 32+8*6(%rsp), $b_ptr # t1 + + mov 32+8*6(%rsp), %rax # t1[0] + mov 32+8*0(%rsp), @acc[6] # t0[0..3] + mov 32+8*1(%rsp), @acc[7] + mov 32+8*2(%rsp), @acc[4] + mov 32+8*3(%rsp), @acc[5] + + call __mulq_mont_383_nonred +___ +{ +my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 + 12,13,"ax","bx","bp","si"); +$code.=<<___; + mov 32+8*12(%rsp), @acc[11] # account for sign from a->re - a->im + mov 32+8*0(%rsp), @acc[6] + mov 32+8*1(%rsp), @acc[7] + and @acc[11], @acc[6] + mov 32+8*2(%rsp), @acc[8] + and @acc[11], @acc[7] + mov 32+8*3(%rsp), @acc[9] + and @acc[11], @acc[8] + mov 32+8*4(%rsp), @acc[10] + and @acc[11], @acc[9] + and @acc[11], @acc[10] + and 32+8*5(%rsp), @acc[11] + + sub @acc[6], @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb @acc[7], @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb @acc[8], @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb @acc[9], @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb @acc[10], @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb @acc[11], @acc[5] + sbb @acc[11], @acc[11] + + and @acc[11], @acc[6] + and @acc[11], @acc[7] + and @acc[11], @acc[8] + and @acc[11], @acc[9] + and @acc[11], @acc[10] + and 8*5($n_ptr), 
@acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + adc @acc[8], @acc[2] + adc @acc[9], @acc[3] + adc @acc[10], @acc[4] + adc @acc[11], @acc[5] + + mov @acc[0], 8*0($r_ptr) # ret->re + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) +___ +} +$code.=<<___; + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_mont_382x,.-sqr_mont_382x +___ +} + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/mulx_mont_256-x86_64.pl b/crypto/blst_src/asm/mulx_mont_256-x86_64.pl new file mode 100755 index 00000000000..0d6bf2e465c --- /dev/null +++ b/crypto/blst_src/asm/mulx_mont_256-x86_64.pl @@ -0,0 +1,486 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# "Sparse" in subroutine names refers to most significant limb of the +# modulus. Though "sparse" is a bit of misnomer, because limitation is +# just not-all-ones. Or in other words not larger than 2^256-2^192-1. +# In general Montgomery multiplication algorithm can handle one of the +# inputs being non-reduced and capped by 1<re, b->re); + #lea 0($b_btr), $b_ptr # b->re + #lea 0($a_ptr), $a_ptr # a->re + lea 40(%rsp), $r_ptr # t0 + call __mulx_384 + + ################################# mul_384(t1, a->im, b->im); + lea 48($b_ptr), $b_ptr # b->im + lea 128+48($a_ptr), $a_ptr # a->im + lea 96($r_ptr), $r_ptr # t1 + call __mulx_384 + + ################################# mul_384(t2, a->re+a->im, b->re+b->im); + mov 8*1(%rsp), $n_ptr + lea ($b_ptr), $a_ptr # b->re + lea -48($b_ptr), $b_org # b->im + lea 40+192+48(%rsp), $r_ptr + call __add_mod_384 + + mov 8*3(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_org # a->im + lea -48($r_ptr), $r_ptr + call __add_mod_384 + + lea ($r_ptr),$b_ptr + lea 48($r_ptr),$a_ptr + call __mulx_384 + + ################################# t2=t2-t0-t1 + lea ($r_ptr), $a_ptr # t2 + lea 40(%rsp), $b_org # t0 + mov 8*1(%rsp), $n_ptr + call __sub_mod_384x384 # t2-t0 + + lea ($r_ptr), $a_ptr # t2 + lea -96($r_ptr), $b_org # t1 + call __sub_mod_384x384 # t2-t0-t1 + + ################################# t0=t0-t1 + lea 40(%rsp), $a_ptr + lea 40+96(%rsp), $b_org + lea 40(%rsp), $r_ptr + call __sub_mod_384x384 # t0-t1 + + lea ($n_ptr), $b_ptr # n_ptr for redc_mont_384 + + ################################# redc_mont_384(ret->re, t0, mod, n0); + lea 40(%rsp), $a_ptr # t0 + mov 8*0(%rsp), %rcx # n0 for redc_mont_384 + mov 8*4(%rsp), $r_ptr # ret->re + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + ################################# redc_mont_384(ret->im, t2, mod, n0); + lea 40+192(%rsp), $a_ptr # t2 + mov 8*0(%rsp), %rcx # n0 for redc_mont_384 + lea 48($r_ptr), $r_ptr # ret->im + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc 
+.size mulx_mont_384x,.-mulx_mont_384x +___ +} +{ my $frame = 4*8 + # place for argument off-load + + 2*384/8 + # place for 2 384-bit temporary vectors + 8; # alignment +$code.=<<___; +.globl sqrx_mont_384x +.hidden sqrx_mont_384x +.type sqrx_mont_384x,\@function,4,"unwind" +.align 32 +sqrx_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $n_ptr, 8*0(%rsp) # n0 + mov $b_org, $n_ptr # n_ptr + # gap for __mulx_mont_384 + mov $r_ptr, 8*2(%rsp) + mov $a_ptr, 8*3(%rsp) + + ################################# add_mod_384(t0, a->re, a->im); + lea 48($a_ptr), $b_org # a->im + lea 32(%rsp), $r_ptr # t0 + call __add_mod_384 + + ################################# sub_mod_384(t1, a->re, a->im); + mov 8*3(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_org # a->im + lea 32+48(%rsp), $r_ptr # t1 + call __sub_mod_384 + + ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); + mov 8*3(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_ptr # a->im + + mov 48($a_ptr), %rdx + mov 8*0($a_ptr), %r14 # @acc[6] + mov 8*1($a_ptr), %r15 # @acc[7] + mov 8*2($a_ptr), %rax # @acc[8] + mov 8*3($a_ptr), %r12 # @acc[4] + mov 8*4($a_ptr), %rdi # $lo + mov 8*5($a_ptr), %rbp # $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %r14, %r8, %r9 + call __mulx_mont_384 +___ +{ +my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 + 8..11,13,14); +$code.=<<___; + add @acc[0], @acc[0] # add with itself + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + mov @acc[0], @acc[6] + adc @acc[3], @acc[3] + mov @acc[1], @acc[7] + adc @acc[4], @acc[4] + mov @acc[2], @acc[8] + adc @acc[5], @acc[5] + mov @acc[3], @acc[9] + sbb $a_ptr, $a_ptr + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $a_ptr + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*6($b_ptr) # ret->im + cmovc @acc[9], @acc[3] + mov @acc[1], 8*7($b_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*8($b_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*9($b_ptr) + mov @acc[4], 8*10($b_ptr) + mov @acc[5], 8*11($b_ptr) +___ +} +$code.=<<___; + ################################# mul_mont_384(ret->re, t0, t1, mod, n0); + lea 32(%rsp), $a_ptr # t0 + lea 32+48(%rsp), $b_ptr # t1 + + mov 32+48(%rsp), %rdx # t1[0] + mov 32+8*0(%rsp), %r14 # @acc[6] + mov 32+8*1(%rsp), %r15 # @acc[7] + mov 32+8*2(%rsp), %rax # @acc[8] + mov 32+8*3(%rsp), %r12 # @acc[4] + mov 32+8*4(%rsp), %rdi # $lo + mov 32+8*5(%rsp), %rbp # $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %r14, %r8, %r9 + call __mulx_mont_384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_mont_384x,.-sqrx_mont_384x + +.globl mulx_382x +.hidden mulx_382x +.type mulx_382x,\@function,4,"unwind" +.align 32 +mulx_382x: 
+.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 96($r_ptr), $r_ptr # ret->im + mov $a_ptr, 8*0(%rsp) + mov $b_org, 8*1(%rsp) + mov $r_ptr, 8*2(%rsp) # offload ret->im + mov $n_ptr, 8*3(%rsp) + + ################################# t0 = a->re + a->im + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + add 8*6($a_ptr), @acc[0] + adc 8*7($a_ptr), @acc[1] + adc 8*8($a_ptr), @acc[2] + adc 8*9($a_ptr), @acc[3] + adc 8*10($a_ptr), @acc[4] + adc 8*11($a_ptr), @acc[5] + + mov @acc[0], 32+8*0(%rsp) + mov @acc[1], 32+8*1(%rsp) + mov @acc[2], 32+8*2(%rsp) + mov @acc[3], 32+8*3(%rsp) + mov @acc[4], 32+8*4(%rsp) + mov @acc[5], 32+8*5(%rsp) + + ################################# t1 = b->re + b->im + mov 8*0($b_org), @acc[0] + mov 8*1($b_org), @acc[1] + mov 8*2($b_org), @acc[2] + mov 8*3($b_org), @acc[3] + mov 8*4($b_org), @acc[4] + mov 8*5($b_org), @acc[5] + + add 8*6($b_org), @acc[0] + adc 8*7($b_org), @acc[1] + adc 8*8($b_org), @acc[2] + adc 8*9($b_org), @acc[3] + adc 8*10($b_org), @acc[4] + adc 8*11($b_org), @acc[5] + + mov @acc[0], 32+8*6(%rsp) + mov @acc[1], 32+8*7(%rsp) + mov @acc[2], 32+8*8(%rsp) + mov @acc[3], 32+8*9(%rsp) + mov @acc[4], 32+8*10(%rsp) + mov @acc[5], 32+8*11(%rsp) + + ################################# mul_384(ret->im, t0, t1); + lea 32+8*0(%rsp), $a_ptr # t0 + lea 32+8*6(%rsp), $b_ptr # t1 + call __mulx_384 + + ################################# mul_384(ret->re, a->re, b->re); + mov 8*0(%rsp), $a_ptr + mov 8*1(%rsp), $b_ptr + lea -96($r_ptr), $r_ptr # ret->re + call __mulx_384 + + ################################# mul_384(tx, a->im, b->im); + lea 48+128($a_ptr), $a_ptr + lea 48($b_ptr), $b_ptr + lea 32(%rsp), $r_ptr + call __mulx_384 + + ################################# ret->im -= tx + mov 8*2(%rsp), $a_ptr # restore ret->im + lea 32(%rsp), $b_org + mov 8*3(%rsp), $n_ptr + mov $a_ptr, $r_ptr + call __sub_mod_384x384 + + ################################# ret->im -= ret->re + lea 0($r_ptr), $a_ptr + lea -96($r_ptr), $b_org + call __sub_mod_384x384 + + ################################# ret->re -= tx + lea -96($r_ptr), $a_ptr + lea 32(%rsp), $b_org + lea -96($r_ptr), $r_ptr + call __sub_mod_384x384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size mulx_382x,.-mulx_382x +___ +} +{ my @acc=(@acc,"%rax","%rbx","%rbp",$b_org); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.globl sqrx_382x +.hidden sqrx_382x +.type sqrx_382x,\@function,3,"unwind" +.align 32 +sqrx_382x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + + ################################# t0 = a->re + a->im + mov 8*0($a_ptr), @acc[6] + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + mov 8*4($a_ptr), 
@acc[10] + mov 8*5($a_ptr), @acc[11] + + mov @acc[6], @acc[0] + add 8*6($a_ptr), @acc[6] + mov @acc[7], @acc[1] + adc 8*7($a_ptr), @acc[7] + mov @acc[8], @acc[2] + adc 8*8($a_ptr), @acc[8] + mov @acc[9], @acc[3] + adc 8*9($a_ptr), @acc[9] + mov @acc[10], @acc[4] + adc 8*10($a_ptr), @acc[10] + mov @acc[11], @acc[5] + adc 8*11($a_ptr), @acc[11] + + mov @acc[6], 8*0($r_ptr) + mov @acc[7], 8*1($r_ptr) + mov @acc[8], 8*2($r_ptr) + mov @acc[9], 8*3($r_ptr) + mov @acc[10], 8*4($r_ptr) + mov @acc[11], 8*5($r_ptr) + + ################################# t1 = a->re - a->im + lea 48($a_ptr), $b_org + lea 48($r_ptr), $r_ptr + call __sub_mod_384_a_is_loaded + + ################################# mul_384(ret->re, t0, t1); + lea ($r_ptr), $a_ptr + lea -48($r_ptr), $b_ptr + lea -48($r_ptr), $r_ptr + call __mulx_384 + + ################################# mul_384(ret->im, a->re, a->im); + mov (%rsp), $a_ptr + lea 48($a_ptr), $b_ptr + lea 96($r_ptr), $r_ptr + call __mulx_384 + + mov 8*0($r_ptr), @acc[0] # double ret->im + mov 8*1($r_ptr), @acc[1] + mov 8*2($r_ptr), @acc[2] + mov 8*3($r_ptr), @acc[3] + mov 8*4($r_ptr), @acc[4] + mov 8*5($r_ptr), @acc[5] + mov 8*6($r_ptr), @acc[6] + mov 8*7($r_ptr), @acc[7] + mov 8*8($r_ptr), @acc[8] + mov 8*9($r_ptr), @acc[9] + mov 8*10($r_ptr), @acc[10] + add @acc[0], @acc[0] + mov 8*11($r_ptr), @acc[11] + adc @acc[1], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[2], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[3], @acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[4], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[5], @acc[5] + mov @acc[4], 8*4($r_ptr) + adc @acc[6], @acc[6] + mov @acc[5], 8*5($r_ptr) + adc @acc[7], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[8], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[9], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[10], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[11], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + mov 8*1(%rsp),%r15 +.cfi_restore %r15 + mov 8*2(%rsp),%r14 +.cfi_restore %r14 + mov 8*3(%rsp),%r13 +.cfi_restore %r13 + mov 8*4(%rsp),%r12 +.cfi_restore %r12 + mov 8*5(%rsp),%rbx +.cfi_restore %rbx + mov 8*6(%rsp),%rbp +.cfi_restore %rbp + lea 8*7(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_382x,.-sqrx_382x +___ +} +{ ########################################################## 384-bit mulx +my ($a0, $a1) = @acc[6..7]; +my @acc = @acc[0..5]; +my ($lo, $hi, $zr) = ("%rax", "%rcx", "%rbp"); + +$code.=<<___; +.globl mulx_384 +.hidden mulx_384 +.type mulx_384,\@function,3,"unwind" +.align 32 +mulx_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.cfi_end_prologue + + mov $b_org, $b_ptr # evacuate from %rdx + call __mulx_384 + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.cfi_epilogue + ret +.cfi_endproc +.size mulx_384,.-mulx_384 + +.type __mulx_384,\@abi-omnipotent +.align 32 +__mulx_384: + mov 8*0($b_ptr), %rdx + mov 8*0($a_ptr), $a0 + mov 8*1($a_ptr), $a1 + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + lea -128($a_ptr), $a_ptr + + mulx $a0, @acc[1], $hi + xor $zr, $zr + + mulx $a1, @acc[0], $lo + adcx $hi, @acc[0] + mov @acc[1], 
8*0($r_ptr) + + mulx @acc[2], @acc[1], $hi + adcx $lo, @acc[1] + + mulx @acc[3], @acc[2], $lo + adcx $hi, @acc[2] + + mulx @acc[4], @acc[3], $hi + adcx $lo, @acc[3] + + mulx @acc[5], @acc[4], @acc[5] + mov 8*1($b_ptr), %rdx + adcx $hi, @acc[4] + adcx $zr, @acc[5] +___ +for(my $i=1; $i<6; $i++) { +my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : "%rax"; +$code.=<<___; + mulx $a0, $lo, $hi + adcx @acc[0], $lo + adox $hi, @acc[1] + mov $lo, 8*$i($r_ptr) + + mulx $a1, @acc[0], $hi + adcx @acc[1], $acc[0] + adox $hi, @acc[2] + + mulx 128+8*2($a_ptr), @acc[1], $lo + adcx @acc[2], @acc[1] + adox $lo, @acc[3] + + mulx 128+8*3($a_ptr), @acc[2], $hi + adcx @acc[3], @acc[2] + adox $hi, @acc[4] + + mulx 128+8*4($a_ptr), @acc[3], $lo + adcx @acc[4], @acc[3] + adox @acc[5], $lo + + mulx 128+8*5($a_ptr), @acc[4], @acc[5] + mov $b_next, %rdx + adcx $lo, @acc[4] + adox $zr, @acc[5] + adcx $zr, @acc[5] +___ +} +$code.=<<___; + mov @acc[0], 8*6($r_ptr) + mov @acc[1], 8*7($r_ptr) + mov @acc[2], 8*8($r_ptr) + mov @acc[3], 8*9($r_ptr) + mov @acc[4], 8*10($r_ptr) + mov @acc[5], 8*11($r_ptr) + + ret +.size __mulx_384,.-__mulx_384 +___ +} +{ ########################################################## 384-bit sqrx +$code.=<<___; +.globl sqrx_384 +.hidden sqrx_384 +.type sqrx_384,\@function,2,"unwind" +.align 32 +sqrx_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __sqrx_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_384,.-sqrx_384 +___ +if (0) { +# up to 5% slower than below variant +my @acc=map("%r$_",("no",8..15,"cx","bx")); + push(@acc, $a_ptr); +my ($lo, $hi, $carry)=("%rax", "%rbp", "%rno"); + +$code.=<<___; +.type __sqrx_384,\@abi-omnipotent +.align 32 +__sqrx_384: + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + mov 8*4($a_ptr), @acc[10] + + ######################################### + mulx @acc[7], @acc[1], $lo # a[1]*a[0] + mov 8*5($a_ptr), @acc[11] + mulx @acc[8], @acc[2], $hi # a[2]*a[0] + add $lo, @acc[2] + mulx @acc[9], @acc[3], $lo # a[3]*a[0] + adc $hi, @acc[3] + mulx @acc[10], @acc[4], $hi # a[4]*a[0] + adc $lo, @acc[4] + mulx @acc[11], @acc[5], @acc[6] # a[5]*a[0] + adc $hi, @acc[5] + adc \$0, @acc[6] + + mulx %rdx, $lo, $hi # a[0]*a[0] + mov @acc[7], %rdx + xor @acc[7], @acc[7] + add @acc[1], @acc[1] # double acc[1] + adc \$0, @acc[7] + add $hi, @acc[1] + adc \$0, @acc[7] + mov $lo, 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) +___ +($carry, @acc[7]) = (@acc[7], @acc[1]); +$code.=<<___; + ######################################### + xor @acc[7], @acc[7] + mulx @acc[8], $lo, $hi # a[2]*a[1] + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx @acc[9], $lo, $hi # a[3]*a[1] + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx @acc[10], $lo, $hi # a[4]*a[1] + adcx $lo, @acc[5] + adox $hi, @acc[6] + + mulx @acc[11], $lo, $hi # a[5]*a[1] + adcx $lo, @acc[6] + adox @acc[7], $hi + adcx $hi, @acc[7] + + mulx %rdx, $lo, $hi # a[1]*a[1] + mov @acc[8], %rdx + xor @acc[8], @acc[8] + adox @acc[2], @acc[2] # double acc[2:3] + adcx $carry, $lo # can't carry + adox @acc[3], @acc[3] + adcx $lo, 
@acc[2] + adox @acc[8], @acc[8] + adcx $hi, @acc[3] + adc \$0, @acc[8] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) +___ +($carry,@acc[8])=(@acc[8],$carry); +$code.=<<___; + ######################################### + xor @acc[8], @acc[8] + mulx @acc[9], $lo, $hi # a[3]*a[2] + adcx $lo, @acc[5] + adox $hi, @acc[6] + + mulx @acc[10], $lo, $hi # a[4]*a[2] + adcx $lo, @acc[6] + adox $hi, @acc[7] + + mulx @acc[11], $lo, $hi # a[5]*a[2] + adcx $lo, @acc[7] + adox @acc[8], $hi + adcx $hi, @acc[8] + + mulx %rdx, $lo, $hi # a[2]*a[2] + mov @acc[9], %rdx + xor @acc[9], @acc[9] + adox @acc[4], @acc[4] # double acc[4:5] + adcx $carry, $lo # can't carry + adox @acc[5], @acc[5] + adcx $lo, @acc[4] + adox @acc[9], @acc[9] + adcx $hi, @acc[5] + adc \$0, $acc[9] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) +___ +($carry,@acc[9])=(@acc[9],$carry); +$code.=<<___; + ######################################### + xor @acc[9], @acc[9] + mulx @acc[10], $lo, $hi # a[4]*a[3] + adcx $lo, @acc[7] + adox $hi, @acc[8] + + mulx @acc[11], $lo, $hi # a[5]*a[3] + adcx $lo, @acc[8] + adox @acc[9], $hi + adcx $hi, @acc[9] + + mulx %rdx, $lo, $hi + mov @acc[10], %rdx + xor @acc[10], @acc[10] + adox @acc[6], @acc[6] # double acc[6:7] + adcx $carry, $lo # can't carry + adox @acc[7], @acc[7] + adcx $lo, @acc[6] + adox @acc[10], @acc[10] + adcx $hi, @acc[7] + adc \$0, $acc[10] + mov @acc[6], 8*6($r_ptr) + mov @acc[7], 8*7($r_ptr) +___ +($carry,@acc[10])=(@acc[10],$carry); +$code.=<<___; + ######################################### + mulx @acc[11], $lo, @acc[10] # a[5]*a[4] + add $lo, @acc[9] + adc \$0, @acc[10] + + mulx %rdx, $lo, $hi # a[4]*a[4] + mov @acc[11], %rdx + xor @acc[11], @acc[11] + adox @acc[8], @acc[8] # double acc[8:10] + adcx $carry, $lo # can't carry + adox @acc[9], @acc[9] + adcx $lo, @acc[8] + adox @acc[10], @acc[10] + adcx $hi, @acc[9] + adox @acc[11], @acc[11] + mov @acc[8], 8*8($r_ptr) + mov @acc[9], 8*9($r_ptr) + + ######################################### + mulx %rdx, $lo, $hi # a[5]*a[5] + adcx $lo, @acc[10] + adcx $hi, @acc[11] + + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __sqrx_384,.-__sqrx_384 +___ +} else { +my @acc=map("%r$_",("no",8..15,"cx","bx","bp")); +my ($lo, $hi)=($r_ptr, "%rax"); + +$code.=<<___; +.type __sqrx_384,\@abi-omnipotent +.align 32 +__sqrx_384: + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + mov 8*4($a_ptr), @acc[10] + + ######################################### + mulx @acc[7], @acc[1], $lo # a[1]*a[0] + mov 8*5($a_ptr), @acc[11] + mulx @acc[8], @acc[2], $hi # a[2]*a[0] + add $lo, @acc[2] + mulx @acc[9], @acc[3], $lo # a[3]*a[0] + adc $hi, @acc[3] + mulx @acc[10], @acc[4], $hi # a[4]*a[0] + adc $lo, @acc[4] + mulx @acc[11], @acc[5], @acc[6] # a[5]*a[0] + mov @acc[7], %rdx + adc $hi, @acc[5] + adc \$0, @acc[6] + + ######################################### + xor @acc[7], @acc[7] + mulx @acc[8], $lo, $hi # a[2]*a[1] + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx @acc[9], $lo, $hi # a[3]*a[1] + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx @acc[10], $lo, $hi # a[4]*a[1] + adcx $lo, @acc[5] + adox $hi, @acc[6] + + mulx @acc[11], $lo, $hi # a[5]*a[1] + mov @acc[8], %rdx + adcx $lo, @acc[6] + adox @acc[7], $hi + adcx $hi, @acc[7] + + ######################################### + xor @acc[8], @acc[8] + mulx @acc[9], $lo, $hi # a[3]*a[2] + adcx $lo, @acc[5] + adox $hi, @acc[6] + + mulx @acc[10], $lo, $hi # a[4]*a[2] + adcx $lo, @acc[6] + adox $hi, @acc[7] + + mulx @acc[11], $lo, $hi # 
a[5]*a[2] + mov @acc[9], %rdx + adcx $lo, @acc[7] + adox @acc[8], $hi + adcx $hi, @acc[8] + + ######################################### + xor @acc[9], @acc[9] + mulx @acc[10], $lo, $hi # a[4]*a[3] + adcx $lo, @acc[7] + adox $hi, @acc[8] + + mulx @acc[11], $lo, $hi # a[5]*a[3] + mov @acc[10], %rdx + adcx $lo, @acc[8] + adox @acc[9], $hi + adcx $hi, @acc[9] + + ######################################### + mulx @acc[11], $lo, @acc[10] # a[5]*a[4] + mov 8*0($a_ptr), %rdx + add $lo, @acc[9] + mov 8(%rsp), $r_ptr # restore $r_ptr + adc \$0, @acc[10] + + ######################################### double acc[1:10] + xor @acc[11], @acc[11] + adcx @acc[1], @acc[1] + adcx @acc[2], @acc[2] + adcx @acc[3], @acc[3] + adcx @acc[4], @acc[4] + adcx @acc[5], @acc[5] + + ######################################### accumulate a[i]*a[i] + mulx %rdx, %rdx, $hi # a[0]*a[0] + mov %rdx, 8*0($r_ptr) + mov 8*1($a_ptr), %rdx + adox $hi, @acc[1] + mov @acc[1], 8*1($r_ptr) + + mulx %rdx, @acc[1], $hi # a[1]*a[1] + mov 8*2($a_ptr), %rdx + adox @acc[1], @acc[2] + adox $hi, @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mulx %rdx, @acc[1], @acc[2] # a[2]*a[2] + mov 8*3($a_ptr), %rdx + adox @acc[1], @acc[4] + adox @acc[2], @acc[5] + adcx @acc[6], @acc[6] + adcx @acc[7], @acc[7] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mulx %rdx, @acc[1], @acc[2] # a[3]*a[3] + mov 8*4($a_ptr), %rdx + adox @acc[1], @acc[6] + adox @acc[2], @acc[7] + adcx @acc[8], @acc[8] + adcx @acc[9], @acc[9] + mov @acc[6], 8*6($r_ptr) + mov @acc[7], 8*7($r_ptr) + + mulx %rdx, @acc[1], @acc[2] # a[4]*a[4] + mov 8*5($a_ptr), %rdx + adox @acc[1], @acc[8] + adox @acc[2], @acc[9] + adcx @acc[10], @acc[10] + adcx @acc[11], @acc[11] + mov @acc[8], 8*8($r_ptr) + mov @acc[9], 8*9($r_ptr) + + mulx %rdx, @acc[1], @acc[2] # a[5]*a[5] + adox @acc[1], @acc[10] + adox @acc[2], @acc[11] + + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __sqrx_384,.-__sqrx_384 +___ +} + +{ ########################################################## 384-bit redcx_mont +my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" +my ($lo, $hi) = ("%rax", "%rbp"); + +$code.=<<___; +######################################################################## +# void redcx_mont_384(uint64_t ret[6], const uint64_t a[12], +# uint64_t m[6], uint64_t n0); +.globl redcx_mont_384 +.hidden redcx_mont_384 +.type redcx_mont_384,\@function,4,"unwind" +.align 32 +redcx_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size redcx_mont_384,.-redcx_mont_384 + +######################################################################## +# void fromx_mont_384(uint64_t ret[6], const uint64_t a[6], +# uint64_t m[6], uint64_t n0); +.globl fromx_mont_384 +.hidden fromx_mont_384 +.type fromx_mont_384,\@function,4,"unwind" +.align 32 +fromx_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push 
%r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulx_by_1_mont_384 + + ################################# + # Branch-less conditional acc[0:6] - modulus + + mov @acc[6], %rax + mov @acc[7], %rcx + mov @acc[0], %rdx + mov @acc[1], %rbp + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[7] + mov @acc[2], @acc[5] + sbb 8*2($n_ptr), @acc[0] + sbb 8*3($n_ptr), @acc[1] + sbb 8*4($n_ptr), @acc[2] + mov @acc[3], $a_ptr + sbb 8*5($n_ptr), @acc[3] + + cmovc %rax, @acc[6] + cmovc %rcx, @acc[7] + cmovc %rdx, @acc[0] + mov @acc[6], 8*0($r_ptr) + cmovc %rbp, @acc[1] + mov @acc[7], 8*1($r_ptr) + cmovc @acc[5], @acc[2] + mov @acc[0], 8*2($r_ptr) + cmovc $a_ptr, @acc[3] + mov @acc[1], 8*3($r_ptr) + mov @acc[2], 8*4($r_ptr) + mov @acc[3], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size fromx_mont_384,.-fromx_mont_384 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulx_by_1_mont_384,\@abi-omnipotent +.align 32 +__mulx_by_1_mont_384: + mov 8*0($a_ptr), @acc[0] + mov $n0, %rdx + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] +___ +for (my $i=0; $i<6; $i++) { +$code.=<<___; + imulq @acc[0], %rdx + + ################################# reduction $i + xor @acc[6], @acc[6] # @acc[6]=0, cf=0, of=0 + mulx 8*0($n_ptr), $lo, $hi + adcx $lo, @acc[0] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx 8*4($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx 8*5($n_ptr), $lo, $hi + mov $n0, %rdx + adcx $lo, @acc[5] + adox @acc[6], $hi + adcx $hi, @acc[6] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + ret +.size __mulx_by_1_mont_384,.-__mulx_by_1_mont_384 + +.type __redc_tail_mont_384,\@abi-omnipotent +.align 32 +__redc_tail_mont_384: + add 8*6($a_ptr), @acc[0] # accumulate upper half + mov @acc[0], %rax + adc 8*7($a_ptr), @acc[1] + adc 8*8($a_ptr), @acc[2] + adc 8*9($a_ptr), @acc[3] + mov @acc[1], %rcx + adc 8*10($a_ptr), @acc[4] + adc 8*11($a_ptr), @acc[5] + sbb @acc[6], @acc[6] + + ################################# + # Branch-less conditional acc[0:6] - modulus + + mov @acc[2], %rdx + mov @acc[3], %rbp + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[7] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], $a_ptr + sbb 8*5($n_ptr), @acc[5] + sbb \$0, @acc[6] + + cmovc %rax, @acc[0] + cmovc %rcx, @acc[1] + cmovc %rdx, @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc %rbp, @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[7], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc $a_ptr, @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl sgn0x_pty_mont_384 +.hidden sgn0x_pty_mont_384 +.type sgn0x_pty_mont_384,\@function,3,"unwind" +.align 32 +sgn0x_pty_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push 
%r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $n_ptr + lea 0($r_ptr), $a_ptr + mov $b_org, $n0 + call __mulx_by_1_mont_384 + + xor %rax, %rax + mov @acc[0], @acc[7] + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + not %rax # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0x_pty_mont_384,.-sgn0x_pty_mont_384 + +.globl sgn0x_pty_mont_384x +.hidden sgn0x_pty_mont_384x +.type sgn0x_pty_mont_384x,\@function,3,"unwind" +.align 32 +sgn0x_pty_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $n_ptr + lea 48($r_ptr), $a_ptr # sgn0(a->im) + mov $b_org, $n0 + call __mulx_by_1_mont_384 + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + lea 0($r_ptr), $a_ptr # sgn0(a->re) + xor $r_ptr, $r_ptr + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, $r_ptr + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + mov @acc[0], 0(%rsp) # a->im is zero or not + not $r_ptr # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, $r_ptr + or @acc[7], $r_ptr # pack sign and parity + + call __mulx_by_1_mont_384 + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + xor %rax, %rax + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + mov 0(%rsp), @acc[6] + + not %rax # 2*x > p, which means "negative" + + test @acc[0], @acc[0] + cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) + + test @acc[6], @acc[6] + cmovnz $r_ptr, %rax # a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0x_pty_mont_384x,.-sgn0x_pty_mont_384x +___ +} } + +{ ########################################################## mulx/sqrx_mont +my @acc = (@acc, "%rax"); +my ($lo,$hi)=("%rdi","%rbp"); + +$code.=<<___; +.globl mulx_mont_384 +.hidden mulx_mont_384 +.type mulx_mont_384,\@function,5,"unwind" +.align 32 +mulx_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*3(%rsp), %rsp +.cfi_adjust_cfa_offset 8*3 +.cfi_end_prologue + + mov $b_org, $b_ptr # evacuate from %rdx + mov 8*0($b_org), %rdx + mov 8*0($a_ptr), @acc[6] + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[4] + mov $r_ptr, 8*2(%rsp) + mov 8*4($a_ptr), $lo + mov 8*5($a_ptr), $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + mov $n0, (%rsp) + + mulx @acc[6],@acc[0],@acc[1] # a[0]*b[0] + call __mulx_mont_384 + + mov 8*3(%rsp),%r15 +.cfi_restore %r15 + mov 8*4(%rsp),%r14 +.cfi_restore %r14 + mov 8*5(%rsp),%r13 +.cfi_restore %r13 + mov 8*6(%rsp),%r12 +.cfi_restore %r12 + mov 8*7(%rsp),%rbx +.cfi_restore %rbx + mov 8*8(%rsp),%rbp +.cfi_restore %rbp + lea 8*9(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 +.cfi_epilogue + ret +.cfi_endproc +.size mulx_mont_384,.-mulx_mont_384 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulx_mont_384,\@abi-omnipotent +.align 32 +__mulx_mont_384: +.cfi_startproc + mulx @acc[7], @acc[6], @acc[2] + mulx @acc[8], @acc[7], @acc[3] + add @acc[6], @acc[1] + mulx @acc[4], @acc[8], @acc[4] + adc @acc[7], @acc[2] + mulx $lo, $lo, @acc[5] + adc @acc[8], @acc[3] + mulx $hi, $hi, @acc[6] + mov 8($b_ptr), %rdx + adc $lo, @acc[4] + adc $hi, @acc[5] + adc \$0, @acc[6] + xor @acc[7], @acc[7] + +___ +for (my $i=1; $i<6; $i++) { +my $tt = $i==1 ? @acc[7] : $hi; +my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + mov @acc[0], 16(%rsp) + imulq 8(%rsp), @acc[0] + + ################################# Multiply by b[$i] + xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 + mulx 8*0+128($a_ptr), $lo, $hi + adox $lo, @acc[1] + adcx $hi, @acc[2] + + mulx 8*1+128($a_ptr), $lo, $hi + adox $lo, @acc[2] + adcx $hi, @acc[3] + + mulx 8*2+128($a_ptr), $lo, $hi + adox $lo, @acc[3] + adcx $hi, @acc[4] + + mulx 8*3+128($a_ptr), $lo, $hi + adox $lo, @acc[4] + adcx $hi, @acc[5] + + mulx 8*4+128($a_ptr), $lo, $hi + adox $lo, @acc[5] + adcx $hi, @acc[6] + + mulx 8*5+128($a_ptr), $lo, $hi + mov @acc[0], %rdx + adox $lo, @acc[6] + adcx $hi, @acc[7] # cf=0 + adox @acc[8], @acc[7] + adox @acc[8], @acc[8] + + ################################# reduction + xor @acc[0], @acc[0] # acc[0]=0, cf=0, of=0 + mulx 8*0+128($n_ptr), $lo, $hi + adcx 16(%rsp), $lo # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx 8*4+128($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx 8*5+128($n_ptr), $lo, $hi + mov $b_next, %rdx + adcx $lo, @acc[5] + adox $hi, @acc[6] + adcx @acc[0], @acc[6] + adox @acc[0], @acc[7] + adcx @acc[0], @acc[7] + adox @acc[0], @acc[8] + adcx @acc[0], @acc[8] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + imulq 8(%rsp), %rdx + mov 8*3(%rsp), $b_ptr # restore $r_ptr + + ################################# last reduction + xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 + mulx 8*0+128($n_ptr), $lo, $hi + adcx $lo, @acc[0] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + mov @acc[2], @acc[0] + + mulx 8*4+128($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + mov @acc[3], $a_ptr + + mulx 8*5+128($n_ptr), $lo, $hi + adcx $lo, @acc[5] + adox $hi, @acc[6] + mov @acc[1], %rdx + adcx @acc[8], @acc[6] + adox @acc[8], @acc[7] + lea 128($n_ptr), $n_ptr + mov @acc[4], @acc[8] + adc \$0, @acc[7] + + ################################# + # Branch-less conditional acc[1:7] - modulus + + sub 8*0($n_ptr), @acc[1] + sbb 8*1($n_ptr), @acc[2] + mov @acc[5], $lo + sbb 8*2($n_ptr), @acc[3] + sbb 8*3($n_ptr), @acc[4] + sbb 8*4($n_ptr), @acc[5] + mov @acc[6], $hi + sbb 8*5($n_ptr), @acc[6] + sbb \$0, @acc[7] + + cmovnc @acc[1], %rdx + cmovc @acc[0], @acc[2] + cmovc $a_ptr, @acc[3] + cmovnc @acc[4], @acc[8] + mov %rdx, 8*0($b_ptr) + cmovnc @acc[5], $lo + mov @acc[2], 8*1($b_ptr) + cmovnc @acc[6], $hi + mov @acc[3], 8*2($b_ptr) + mov @acc[8], 8*3($b_ptr) + mov $lo, 8*4($b_ptr) + mov $hi, 8*5($b_ptr) + + ret +.cfi_endproc +.size __mulx_mont_384,.-__mulx_mont_384 +___ +} +$code.=<<___; +.globl sqrx_mont_384 +.hidden sqrx_mont_384 +.type sqrx_mont_384,\@function,4,"unwind" +.align 32 +sqrx_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*3(%rsp), %rsp +.cfi_adjust_cfa_offset 8*3 +.cfi_end_prologue + + mov $n_ptr, $n0 # n0 + lea -128($b_org), $n_ptr # control u-op density + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[4] + mov $r_ptr, 8*2(%rsp) + 
mov 8*4($a_ptr), $lo + mov 8*5($a_ptr), $hi + + lea ($a_ptr), $b_ptr + mov $n0, (%rsp) # n0 + lea -128($a_ptr), $a_ptr # control u-op density + + mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] + call __mulx_mont_384 # as fast as dedicated squaring + + mov 8*3(%rsp),%r15 +.cfi_restore %r15 + mov 8*4(%rsp),%r14 +.cfi_restore %r14 + mov 8*5(%rsp),%r13 +.cfi_restore %r13 + mov 8*6(%rsp),%r12 +.cfi_restore %r12 + mov 8*7(%rsp),%rbx +.cfi_restore %rbx + mov 8*8(%rsp),%rbp +.cfi_restore %rbp + lea 8*9(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_mont_384,.-sqrx_mont_384 + +.globl sqrx_n_mul_mont_384 +.hidden sqrx_n_mul_mont_384 +.type sqrx_n_mul_mont_384,\@function,6,"unwind" +.align 32 +sqrx_n_mul_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*5(%rsp), %rsp +.cfi_adjust_cfa_offset 8*5 +.cfi_end_prologue + + mov $b_org, @acc[2] # loop counter + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov $a_ptr, $b_ptr + mov 8*3($a_ptr), @acc[4] + mov $r_ptr, 8*2(%rsp) # to __mulx_mont_384 + mov 8*4($a_ptr), $lo + mov 8*5($a_ptr), $hi + + mov $n0, (%rsp) + mov %r9, 8*3(%rsp) # 6th, multiplicand argument + movq 8*0(%r9), %xmm2 # prefetch b[0] + +.Loop_sqrx_384: + movd @acc[2]d, %xmm1 + lea -128($b_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] + call __mulx_mont_384 + + movd %xmm1, @acc[2]d + dec @acc[2]d + jnz .Loop_sqrx_384 + + mov %rdx, @acc[6] + movq %xmm2, %rdx # b[0] + lea -128($b_ptr), $a_ptr # control u-op density + mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument + lea -128($n_ptr), $n_ptr # control u-op density + + mulx @acc[6],@acc[0],@acc[1] # a[0]*b[0] + call __mulx_mont_384 + + mov 8*5(%rsp),%r15 +.cfi_restore %r15 + mov 8*6(%rsp),%r14 +.cfi_restore %r14 + mov 8*7(%rsp),%r13 +.cfi_restore %r13 + mov 8*8(%rsp),%r12 +.cfi_restore %r12 + mov 8*9(%rsp),%rbx +.cfi_restore %rbx + mov 8*10(%rsp),%rbp +.cfi_restore %rbp + lea 8*11(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_n_mul_mont_384,.-sqrx_n_mul_mont_384 + +.globl sqrx_n_mul_mont_383 +.hidden sqrx_n_mul_mont_383 +.type sqrx_n_mul_mont_383,\@function,6,"unwind" +.align 32 +sqrx_n_mul_mont_383: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*5(%rsp), %rsp +.cfi_adjust_cfa_offset 8*5 +.cfi_end_prologue + + mov $b_org, @acc[2] # loop counter + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov $a_ptr, $b_ptr + mov 8*3($a_ptr), @acc[4] + mov $r_ptr, 8*2(%rsp) # to __mulx_mont_383_nonred + mov 8*4($a_ptr), $lo + mov 8*5($a_ptr), $hi + + mov $n0, (%rsp) + mov %r9, 8*3(%rsp) # 6th, multiplicand argument + movq 8*0(%r9), %xmm2 # prefetch b[0] + lea -128($n_ptr), $n_ptr # control u-op density + +.Loop_sqrx_383: + movd @acc[2]d, %xmm1 + lea -128($b_ptr), $a_ptr # control u-op density + + mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] + call __mulx_mont_383_nonred # omitting full reduction gives ~15% + # in addition-chains + movd %xmm1, @acc[2]d + dec @acc[2]d + jnz .Loop_sqrx_383 + + mov %rdx, @acc[6] + movq %xmm2, %rdx # b[0] + lea -128($b_ptr), $a_ptr # control u-op density + mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument + + mulx @acc[6], 
@acc[0], @acc[1] # a[0]*b[0] + call __mulx_mont_384 + + mov 8*5(%rsp),%r15 +.cfi_restore %r15 + mov 8*6(%rsp),%r14 +.cfi_restore %r14 + mov 8*7(%rsp),%r13 +.cfi_restore %r13 + mov 8*8(%rsp),%r12 +.cfi_restore %r12 + mov 8*9(%rsp),%rbx +.cfi_restore %rbx + mov 8*10(%rsp),%rbp +.cfi_restore %rbp + lea 8*11(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_n_mul_mont_383,.-sqrx_n_mul_mont_383 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulx_mont_383_nonred,\@abi-omnipotent +.align 32 +__mulx_mont_383_nonred: +.cfi_startproc + mulx @acc[7], @acc[6], @acc[2] + mulx @acc[8], @acc[7], @acc[3] + add @acc[6], @acc[1] + mulx @acc[4], @acc[8], @acc[4] + adc @acc[7], @acc[2] + mulx $lo, $lo, @acc[5] + adc @acc[8], @acc[3] + mulx $hi, $hi, @acc[6] + mov 8($b_ptr), %rdx + adc $lo, @acc[4] + adc $hi, @acc[5] + adc \$0, @acc[6] +___ +for (my $i=1; $i<6; $i++) { +my $tt = $i==1 ? @acc[7] : $hi; +my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + mov @acc[0], @acc[8] + imulq 8(%rsp), @acc[0] + + ################################# Multiply by b[$i] + xor @acc[7], @acc[7] # @acc[8]=0, cf=0, of=0 + mulx 8*0+128($a_ptr), $lo, $hi + adox $lo, @acc[1] + adcx $hi, @acc[2] + + mulx 8*1+128($a_ptr), $lo, $hi + adox $lo, @acc[2] + adcx $hi, @acc[3] + + mulx 8*2+128($a_ptr), $lo, $hi + adox $lo, @acc[3] + adcx $hi, @acc[4] + + mulx 8*3+128($a_ptr), $lo, $hi + adox $lo, @acc[4] + adcx $hi, @acc[5] + + mulx 8*4+128($a_ptr), $lo, $hi + adox $lo, @acc[5] + adcx $hi, @acc[6] + + mulx 8*5+128($a_ptr), $lo, $hi + mov @acc[0], %rdx + adox $lo, @acc[6] + adcx @acc[7], $hi + adox $hi, @acc[7] + + ################################# reduction + xor @acc[0], @acc[0] # acc[0]=0, cf=0, of=0 + mulx 8*0+128($n_ptr), $lo, $hi + adcx $lo, @acc[8] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx 8*4+128($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx 8*5+128($n_ptr), $lo, $hi + mov $b_next, %rdx + adcx $lo, @acc[5] + adox $hi, @acc[6] + adcx @acc[8], @acc[6] + adox @acc[8], @acc[7] + adcx @acc[8], @acc[7] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + imulq 8(%rsp), %rdx + mov 8*3(%rsp), $b_ptr # restore $r_ptr + + ################################# last reduction + xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 + mulx 8*0+128($n_ptr), $lo, $hi + adcx $lo, @acc[0] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx 8*4+128($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx 8*5+128($n_ptr), $lo, $hi + mov @acc[1], %rdx + adcx $lo, @acc[5] + adox $hi, @acc[6] + adc \$0, @acc[6] + mov @acc[4], @acc[8] + + mov @acc[1], 8*0($b_ptr) + mov @acc[2], 8*1($b_ptr) + mov @acc[3], 8*2($b_ptr) + mov @acc[5], $lo + mov @acc[4], 8*3($b_ptr) + mov @acc[5], 8*4($b_ptr) + mov @acc[6], 8*5($b_ptr) + mov @acc[6], $hi + + ret +.cfi_endproc +.size __mulx_mont_383_nonred,.-__mulx_mont_383_nonred +___ +} } } +{ my $frame = 4*8 + # place for argument off-load + + 2*384/8 + # place for 2 384-bit temporary vectors + 8; # align +my @acc = (@acc,"%rax","%rdx","%rbx","%rbp"); + +# omitting 3 reductions 
gives ~10% better performance in add-chains +$code.=<<___; +.globl sqrx_mont_382x +.hidden sqrx_mont_382x +.type sqrx_mont_382x,\@function,4,"unwind" +.align 32 +sqrx_mont_382x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $n_ptr, 8*0(%rsp) # n0 + mov $b_org, $n_ptr # n_ptr + mov $r_ptr, 8*2(%rsp) + mov $a_ptr, 8*3(%rsp) + + ################################# + mov 8*0($a_ptr), @acc[0] # a->re + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov @acc[0], @acc[6] + add 8*6($a_ptr), @acc[0] # a->re + a->im + mov @acc[1], @acc[7] + adc 8*7($a_ptr), @acc[1] + mov @acc[2], @acc[8] + adc 8*8($a_ptr), @acc[2] + mov @acc[3], @acc[9] + adc 8*9($a_ptr), @acc[3] + mov @acc[4], @acc[10] + adc 8*10($a_ptr), @acc[4] + mov @acc[5], @acc[11] + adc 8*11($a_ptr), @acc[5] + + sub 8*6($a_ptr), @acc[6] # a->re - a->im + sbb 8*7($a_ptr), @acc[7] + sbb 8*8($a_ptr), @acc[8] + sbb 8*9($a_ptr), @acc[9] + sbb 8*10($a_ptr), @acc[10] + sbb 8*11($a_ptr), @acc[11] + sbb $r_ptr, $r_ptr # borrow flag as mask + + mov @acc[0], 32+8*0(%rsp) # t0 + mov @acc[1], 32+8*1(%rsp) + mov @acc[2], 32+8*2(%rsp) + mov @acc[3], 32+8*3(%rsp) + mov @acc[4], 32+8*4(%rsp) + mov @acc[5], 32+8*5(%rsp) + + mov @acc[6], 32+8*6(%rsp) # t1 + mov @acc[7], 32+8*7(%rsp) + mov @acc[8], 32+8*8(%rsp) + mov @acc[9], 32+8*9(%rsp) + mov @acc[10], 32+8*10(%rsp) + mov @acc[11], 32+8*11(%rsp) + mov $r_ptr, 32+8*12(%rsp) + + ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); + #mov 8*3(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_ptr # a->im + + mov 48($a_ptr), %rdx + mov 8*0($a_ptr), %r14 # @acc[6] + mov 8*1($a_ptr), %r15 # @acc[7] + mov 8*2($a_ptr), %rax # @acc[8] + mov 8*3($a_ptr), %r12 # @acc[4] + mov 8*4($a_ptr), %rdi # $lo + mov 8*5($a_ptr), %rbp # $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %r14, %r8, %r9 + call __mulx_mont_383_nonred +___ +{ +my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 + 8..11,13,14); +$code.=<<___; + add @acc[0], @acc[0] # add with itself + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + + mov @acc[0], 8*6($b_ptr) # ret->im + mov @acc[1], 8*7($b_ptr) + mov @acc[2], 8*8($b_ptr) + mov @acc[3], 8*9($b_ptr) + mov @acc[4], 8*10($b_ptr) + mov @acc[5], 8*11($b_ptr) +___ +} +$code.=<<___; + ################################# mul_mont_384(ret->re, t0, t1, mod, n0); + lea 32-128(%rsp), $a_ptr # t0 [+u-op density] + lea 32+8*6(%rsp), $b_ptr # t1 + + mov 32+8*6(%rsp), %rdx # t1[0] + mov 32+8*0(%rsp), %r14 # @acc[6] + mov 32+8*1(%rsp), %r15 # @acc[7] + mov 32+8*2(%rsp), %rax # @acc[8] + mov 32+8*3(%rsp), %r12 # @acc[4] + mov 32+8*4(%rsp), %rdi # $lo + mov 32+8*5(%rsp), %rbp # $hi + #lea -128($a_ptr), $a_ptr # control u-op density + #lea -128($n_ptr), $n_ptr # control u-op density + + mulx %r14, %r8, %r9 + call __mulx_mont_383_nonred +___ +{ +my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 + 8..11,13,14); +$code.=<<___; + mov 32+8*12(%rsp), @acc[11] # account for sign from a->re - a->im + lea 128($n_ptr), $n_ptr + mov 32+8*0(%rsp), @acc[6] + and @acc[11], @acc[6] + mov 32+8*1(%rsp), @acc[7] + and @acc[11], @acc[7] + mov 
32+8*2(%rsp), @acc[8] + and @acc[11], @acc[8] + mov 32+8*3(%rsp), @acc[9] + and @acc[11], @acc[9] + mov 32+8*4(%rsp), @acc[10] + and @acc[11], @acc[10] + and 32+8*5(%rsp), @acc[11] + + sub @acc[6], @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb @acc[7], @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb @acc[8], @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb @acc[9], @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb @acc[10], @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb @acc[11], @acc[5] + sbb @acc[11], @acc[11] + + and @acc[11], @acc[6] + and @acc[11], @acc[7] + and @acc[11], @acc[8] + and @acc[11], @acc[9] + and @acc[11], @acc[10] + and 8*5($n_ptr), @acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + adc @acc[8], @acc[2] + adc @acc[9], @acc[3] + adc @acc[10], @acc[4] + adc @acc[11], @acc[5] + + mov @acc[0], 8*0($b_ptr) # ret->re + mov @acc[1], 8*1($b_ptr) + mov @acc[2], 8*2($b_ptr) + mov @acc[3], 8*3($b_ptr) + mov @acc[4], 8*4($b_ptr) + mov @acc[5], 8*5($b_ptr) +___ +} +$code.=<<___; + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_mont_382x,.-sqrx_mont_382x +___ +} + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/sha256-armv8.pl b/crypto/blst_src/asm/sha256-armv8.pl new file mode 100755 index 00000000000..1de27c70667 --- /dev/null +++ b/crypto/blst_src/asm/sha256-armv8.pl @@ -0,0 +1,541 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# sha256_block procedure for ARMv8. +# +# This module is stripped of scalar code paths, with raionale that all +# known processors are NEON-capable. +# +# See original module at CRYPTOGAMS for further details. 
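For orientation, the block routine generated by this module compresses whole 64-byte blocks and leaves message padding to the caller. Below is a minimal C sketch of how such a primitive is typically driven; the prototypes are assumptions inferred from the argument registers used in the assembly (state pointer, input pointer, block count) rather than quotations from blst's headers, and final-block padding is deliberately omitted.

#include <stddef.h>
#include <stdint.h>

/* Assumed prototypes, inferred from the assembly's argument usage. */
void blst_sha256_block_data_order(uint32_t h[8], const void *inp, size_t blocks);
void blst_sha256_emit(unsigned char md[32], const uint32_t h[8]);

/* Hash a message whose length is a multiple of 64 bytes; SHA-256 padding of
 * the final partial block is intentionally left out of this sketch. */
static void sha256_full_blocks(unsigned char md[32],
                               const unsigned char *msg, size_t len)
{
    /* Standard SHA-256 initial state (FIPS 180-4). */
    uint32_t h[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
                      0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };

    blst_sha256_block_data_order(h, msg, len / 64); /* compress all blocks  */
    blst_sha256_emit(md, h);                        /* serialize big-endian */
}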
+ +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +$BITS=256; +$SZ=4; +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); +$rounds=64; +$reg_t="w"; +$pre="blst_"; + +($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); + +$code.=<<___; +.text + +.align 6 +.type .LK$BITS,%object +.LK$BITS: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0 //terminator +.size .LK$BITS,.-.LK$BITS +.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by \@dot-asm" +.align 2 +___ + +if ($SZ==4) { +my $Ktbl="x3"; + +my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); +my @MSG=map("v$_.16b",(4..7)); +my ($W0,$W1)=("v16.4s","v17.4s"); +my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); + +$code.=<<___; +.globl ${pre}sha256_block_armv8 +.type ${pre}sha256_block_armv8,%function +.align 6 +${pre}sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1.32 {$ABCD,$EFGH},[$ctx] + adr $Ktbl,.LK256 + +.Loop_hw: + ld1 {@MSG[0]-@MSG[3]},[$inp],#64 + sub $num,$num,#1 + ld1.32 {$W0},[$Ktbl],#16 + rev32 @MSG[0],@MSG[0] + rev32 @MSG[1],@MSG[1] + rev32 @MSG[2],@MSG[2] + rev32 @MSG[3],@MSG[3] + orr $ABCD_SAVE,$ABCD,$ABCD // offload + orr $EFGH_SAVE,$EFGH,$EFGH +___ +for($i=0;$i<12;$i++) { +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + sha256su0 @MSG[0],@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + sha256su1 @MSG[0],@MSG[2],@MSG[3] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); +} +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + ld1.32 {$W0},[$Ktbl],#16 + add.i32 $W1,$W1,@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + ld1.32 {$W1},[$Ktbl] + add.i32 $W0,$W0,@MSG[2] + sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + add.i32 $W1,$W1,@MSG[3] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + add.i32 $ABCD,$ABCD,$ABCD_SAVE + add.i32 $EFGH,$EFGH,$EFGH_SAVE + + cbnz $num,.Loop_hw + + st1.32 {$ABCD,$EFGH},[$ctx] + + ldr x29,[sp],#16 + ret +.size ${pre}sha256_block_armv8,.-${pre}sha256_block_armv8 +___ +} + +if ($SZ==4) { ######################################### NEON stuff # +# You'll surely note a lot of similarities with sha256-armv4 module, +# and of course it's not a coincidence. 
sha256-armv4 was used as +# initial template, but was adapted for ARMv8 instruction set and +# extensively re-tuned for all-round performance. + +my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); +my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); +my $Ktbl="x16"; +my $Xfer="x17"; +my @X = map("q$_",(0..3)); +my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); +my $j=0; + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; } +sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; } +sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; } + +sub Xupdate() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + &ext_8 ($T0,@X[0],@X[1],4); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &ext_8 ($T3,@X[2],@X[3],4); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15] + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T2,$T0,$sigma0[0]); + eval(shift(@insns)); + &ushr_32 ($T1,$T0,$sigma0[2]); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12] + eval(shift(@insns)); + &sli_32 ($T2,$T0,32-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T3,$T0,$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T1,$T1,$T2); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T3,$T0,32-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T4,$T7,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T4,$T7,32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T5,$T7,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T3,$T7,$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &sli_u32 ($T3,$T7,32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T6,@X[0],$sigma1[0]); + eval(shift(@insns)); + &ushr_32 ($T7,@X[0],$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T6,@X[0],32-$sigma1[0]); + eval(shift(@insns)); + &ushr_32 ($T5,@X[0],$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T7,$T7,$T6); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T5,@X[0],32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_32 ("{$T0}","[$Ktbl], #16"); + eval(shift(@insns)); + &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T5); + eval(shift(@insns)); + eval(shift(@insns)); + &mov (&Dhi($T5), &Dlo($T7)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 ($T0,$T0,@X[0]); + while($#insns>=1) 
{ eval(shift(@insns)); } + &st1_32 ("{$T0}","[$Xfer], #16"); + eval(shift(@insns)); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub Xpreload() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_8 ("{@X[0]}","[$inp],#16"); + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_32 ("{$T0}","[$Ktbl],#16"); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &rev32 (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 ($T0,$T0,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &st1_32 ("{$T0}","[$Xfer], #16"); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. + '&add ($h,$h,$t1)', # h+=X[i]+K[i] + '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past + '&and ($t1,$f,$e)', + '&bic ($t4,$g,$e)', + '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', + '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past + '&orr ($t1,$t1,$t4)', # Ch(e,f,g) + '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) + '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', + '&add ($h,$h,$t1)', # h+=Ch(e,f,g) + '&ror ($t0,$t0,"#$Sigma1[0]")', + '&eor ($t2,$a,$b)', # a^b, b^c in next round + '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) + '&add ($h,$h,$t0)', # h+=Sigma1(e) + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. + '&ldr ($t1,"[$Ktbl]") if ($j==15);'. + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) + '&ror ($t4,$t4,"#$Sigma0[0]")', + '&add ($d,$d,$h)', # d+=h + '&eor ($t3,$t3,$b)', # Maj(a,b,c) + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' + ) +} + +$code.=<<___; +.globl ${pre}sha256_block_data_order +.type ${pre}sha256_block_data_order,%function +.align 4 +${pre}sha256_block_data_order: + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + sub sp,sp,#16*4 + + adr $Ktbl,.LK256 + add $num,$inp,$num,lsl#6 // len to point at the end of inp + + ld1.8 {@X[0]},[$inp], #16 + ld1.8 {@X[1]},[$inp], #16 + ld1.8 {@X[2]},[$inp], #16 + ld1.8 {@X[3]},[$inp], #16 + ld1.32 {$T0},[$Ktbl], #16 + ld1.32 {$T1},[$Ktbl], #16 + ld1.32 {$T2},[$Ktbl], #16 + ld1.32 {$T3},[$Ktbl], #16 + rev32 @X[0],@X[0] // yes, even on + rev32 @X[1],@X[1] // big-endian + rev32 @X[2],@X[2] + rev32 @X[3],@X[3] + mov $Xfer,sp + add.32 $T0,$T0,@X[0] + add.32 $T1,$T1,@X[1] + add.32 $T2,$T2,@X[2] + st1.32 {$T0-$T1},[$Xfer], #32 + add.32 $T3,$T3,@X[3] + st1.32 {$T2-$T3},[$Xfer] + sub $Xfer,$Xfer,#32 + + ldp $A,$B,[$ctx] + ldp $C,$D,[$ctx,#8] + ldp $E,$F,[$ctx,#16] + ldp $G,$H,[$ctx,#24] + ldr $t1,[sp,#0] + mov $t2,wzr + eor $t3,$B,$C + mov $t4,wzr + b .L_00_48 + +.align 4 +.L_00_48: +___ + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); +$code.=<<___; + cmp $t1,#0 // check for K256 terminator + ldr $t1,[sp,#0] + sub $Xfer,$Xfer,#64 + bne .L_00_48 + + sub $Ktbl,$Ktbl,#256 // rewind $Ktbl + cmp $inp,$num + mov $Xfer, #64 + csel $Xfer, $Xfer, xzr, eq + sub $inp,$inp,$Xfer // avoid SEGV + mov $Xfer,sp +___ + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); +$code.=<<___; + add $A,$A,$t4 // h+=Sigma0(a) from the past + ldp $t0,$t1,[$ctx,#0] + add $A,$A,$t2 // h+=Maj(a,b,c) from the past + ldp $t2,$t3,[$ctx,#8] + add $A,$A,$t0 // accumulate + add $B,$B,$t1 + ldp $t0,$t1,[$ctx,#16] + add $C,$C,$t2 + add $D,$D,$t3 + ldp $t2,$t3,[$ctx,#24] + add $E,$E,$t0 + add $F,$F,$t1 + ldr $t1,[sp,#0] + stp $A,$B,[$ctx,#0] + add $G,$G,$t2 + mov $t2,wzr + stp $C,$D,[$ctx,#8] + add $H,$H,$t3 + stp $E,$F,[$ctx,#16] + eor $t3,$B,$C + stp $G,$H,[$ctx,#24] + mov $t4,wzr + mov $Xfer,sp + b.ne .L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret +.size ${pre}sha256_block_data_order,.-${pre}sha256_block_data_order +___ +} + +{ +my ($out,$inp,$len) = map("x$_",(0..2)); + +$code.=<<___; +.globl ${pre}sha256_emit +.hidden ${pre}sha256_emit +.type ${pre}sha256_emit,%function +.align 4 +${pre}sha256_emit: + ldp x4,x5,[$inp] + ldp x6,x7,[$inp,#16] +#ifndef __AARCH64EB__ + rev x4,x4 + rev x5,x5 + rev x6,x6 + rev x7,x7 +#endif + str w4,[$out,#4] + lsr x4,x4,#32 + str w5,[$out,#12] + lsr x5,x5,#32 + str w6,[$out,#20] + lsr x6,x6,#32 + str w7,[$out,#28] + lsr x7,x7,#32 + str w4,[$out,#0] + str w5,[$out,#8] + str w6,[$out,#16] + str w7,[$out,#24] + ret +.size ${pre}sha256_emit,.-${pre}sha256_emit + +.globl ${pre}sha256_bcopy +.hidden ${pre}sha256_bcopy +.type ${pre}sha256_bcopy,%function +.align 4 +${pre}sha256_bcopy: +.Loop_bcopy: + ldrb w3,[$inp],#1 + sub $len,$len,#1 + strb w3,[$out],#1 + cbnz $len,.Loop_bcopy + ret +.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy + +.globl ${pre}sha256_hcopy +.hidden ${pre}sha256_hcopy +.type ${pre}sha256_hcopy,%function +.align 4 +${pre}sha256_hcopy: + ldp x4,x5,[$inp] + ldp x6,x7,[$inp,#16] + stp x4,x5,[$out] + stp x6,x7,[$out,#16] + ret +.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy +___ +} + +{ my %opcode = ( + "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, + "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); + + sub unsha256 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + 
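The blst_sha256_emit routine defined earlier in this file turns the eight 32-bit state words into the canonical digest byte order: each word is written big-endian (the rev/lsr/str sequence on little-endian AArch64). A portable C rendering of the same operation, assuming the state is an array of eight native-endian 32-bit words, is simply:

#include <stdint.h>

/* Reference behaviour of sha256_emit: store every state word big-endian,
 * producing the 32-byte SHA-256 digest. */
static void sha256_emit_ref(unsigned char md[32], const uint32_t h[8])
{
    for (int i = 0; i < 8; i++) {
        md[4*i + 0] = (unsigned char)(h[i] >> 24);
        md[4*i + 1] = (unsigned char)(h[i] >> 16);
        md[4*i + 2] = (unsigned char)(h[i] >>  8);
        md[4*i + 3] = (unsigned char)(h[i]);
    }
}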
+foreach(split("\n",$code)) { + + s/\`([^\`]*)\`/eval($1)/ge; + + s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge or + s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; + + s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers + + s/\.[ui]?8(\s)/$1/; + s/\.\w?64\b// and s/\.16b/\.2d/g or + s/\.\w?32\b// and s/\.16b/\.4s/g; + m/\bext\b/ and s/\.2d/\.16b/g or + m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; + + print $_,"\n"; +} + +close STDOUT; diff --git a/crypto/blst_src/asm/sha256-portable-x86_64.pl b/crypto/blst_src/asm/sha256-portable-x86_64.pl new file mode 100755 index 00000000000..eca0564ebe7 --- /dev/null +++ b/crypto/blst_src/asm/sha256-portable-x86_64.pl @@ -0,0 +1,337 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# sha256_block procedure for x86_64. +# +# Scalar-only version with minor twist minimizing 'lea' instructions. + +$flavour = shift; +$output = pop; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +$pre="blst_"; +$func="${pre}sha256_block_data_order"; +$TABLE="K256"; +$SZ=4; +@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", + "%r8d","%r9d","%r10d","%r11d"); +($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); +$rounds=64; + +$ctx="%rdi"; # 1st arg, zapped by $a3 +$inp="%rsi"; # 2nd arg +$Tbl="%rbp"; + +$_ctx="16*$SZ+0*8(%rsp)"; +$_inp="16*$SZ+1*8(%rsp)"; +$_end="16*$SZ+2*8(%rsp)"; +$framesz="16*$SZ+3*8"; + +sub ROUND_00_15() +{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + my $STRIDE=$SZ; + # $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); + +$code.=<<___; + ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 + mov $f,$a2 + + xor $e,$a0 + ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 + xor $g,$a2 # f^g + + mov $T1,`$SZ*($i&0xf)`(%rsp) + xor $a,$a1 + and $e,$a2 # (f^g)&e + + ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 + add $h,$T1 # T1+=h + xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g + + ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 + xor $e,$a0 + add $a2,$T1 # T1+=Ch(e,f,g) + + mov $a,$a2 + add `$SZ*$i`($Tbl),$T1 # T1+=K[round] + xor $a,$a1 + + xor $b,$a2 # a^b, b^c in next round + ror \$$Sigma1[0],$a0 # Sigma1(e) + mov $b,$h + + and $a2,$a3 + ror \$$Sigma0[0],$a1 # Sigma0(a) + add $a0,$T1 # T1+=Sigma1(e) + + xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b) + add $T1,$d # d+=T1 + add $T1,$h # h+=T1 +___ +$code.=<<___ if ($i==31); + lea `16*$SZ`($Tbl),$Tbl # round+=16 +___ +$code.=<<___ if ($i<15); + add $a1,$h # h+=Sigma0(a) +___ + ($a2,$a3) = ($a3,$a2); +} + +sub ROUND_16_XX() +{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___; + mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 + mov `$SZ*(($i+14)&0xf)`(%rsp),$a2 + + mov $a0,$T1 + ror \$`$sigma0[1]-$sigma0[0]`,$a0 + add $a1,$a # modulo-scheduled h+=Sigma0(a) + mov $a2,$a1 + ror \$`$sigma1[1]-$sigma1[0]`,$a2 + + xor $T1,$a0 + shr \$$sigma0[2],$T1 + ror \$$sigma0[0],$a0 + xor $a1,$a2 + shr 
\$$sigma1[2],$a1 + + ror \$$sigma1[0],$a2 + xor $a0,$T1 # sigma0(X[(i+1)&0xf]) + xor $a1,$a2 # sigma1(X[(i+14)&0xf]) + add `$SZ*(($i+9)&0xf)`(%rsp),$T1 + + add `$SZ*($i&0xf)`(%rsp),$T1 + mov $e,$a0 + add $a2,$T1 + mov $a,$a1 +___ + &ROUND_00_15(@_); +} + +$code=<<___; +.text + +.globl $func +.type $func,\@function,3,"unwind" +.align 16 +$func: +.cfi_startproc + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + shl \$4,%rdx # num*16 + sub \$$framesz,%rsp +.cfi_adjust_cfa_offset $framesz + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + mov $ctx,$_ctx # save ctx, 1st arg + mov $inp,$_inp # save inp, 2nd arh + mov %rdx,$_end # save end pointer, "3rd" arg +.cfi_end_prologue + + mov $SZ*0($ctx),$A + mov $SZ*1($ctx),$B + mov $SZ*2($ctx),$C + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H + jmp .Lloop + +.align 16 +.Lloop: + mov $B,$a3 + lea $TABLE(%rip),$Tbl + xor $C,$a3 # magic +___ + for($i=0;$i<16;$i++) { + $code.=" mov $SZ*$i($inp),$T1\n"; + $code.=" mov @ROT[4],$a0\n"; + $code.=" mov @ROT[0],$a1\n"; + $code.=" bswap $T1\n"; + &ROUND_00_15($i,@ROT); + unshift(@ROT,pop(@ROT)); + } +$code.=<<___; + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: +___ + for(;$i<32;$i++) { + &ROUND_16_XX($i,@ROT); + unshift(@ROT,pop(@ROT)); + } + +$code.=<<___; + cmpb \$0x19,`$SZ-1`($Tbl) + jnz .Lrounds_16_xx + + mov $_ctx,$ctx + add $a1,$A # modulo-scheduled h+=Sigma0(a) + lea 16*$SZ($inp),$inp + + add $SZ*0($ctx),$A + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + jb .Lloop + + lea $framesz+6*8(%rsp),%r11 +.cfi_def_cfa %r11,8 + mov $framesz(%rsp),%r15 +.cfi_restore %r15 + mov -40(%r11),%r14 +.cfi_restore %r14 + mov -32(%r11),%r13 +.cfi_restore %r13 + mov -24(%r11),%r12 +.cfi_restore %r12 + mov -16(%r11),%rbp +.cfi_restore %rbp + mov -8(%r11),%rbx +.cfi_restore %rbx +.cfi_epilogue + lea (%r11),%rsp + ret +.cfi_endproc +.size $func,.-$func + +.align 64 +.type $TABLE,\@object +$TABLE: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm" +___ +{ +my ($out,$inp,$len) = $win64 ? 
("%rcx","%rdx","%r8") : # Win64 order + ("%rdi","%rsi","%rdx"); # Unix order +$code.=<<___; +.globl ${pre}sha256_emit +.hidden ${pre}sha256_emit +.type ${pre}sha256_emit,\@abi-omnipotent +.align 16 +${pre}sha256_emit: + mov 0($inp), %r8 + mov 8($inp), %r9 + mov 16($inp), %r10 + bswap %r8 + mov 24($inp), %r11 + bswap %r9 + mov %r8d, 4($out) + bswap %r10 + mov %r9d, 12($out) + bswap %r11 + mov %r10d, 20($out) + shr \$32, %r8 + mov %r11d, 28($out) + shr \$32, %r9 + mov %r8d, 0($out) + shr \$32, %r10 + mov %r9d, 8($out) + shr \$32, %r11 + mov %r10d, 16($out) + mov %r11d, 24($out) + ret +.size ${pre}sha256_emit,.-${pre}sha256_emit + +.globl ${pre}sha256_bcopy +.hidden ${pre}sha256_bcopy +.type ${pre}sha256_bcopy,\@abi-omnipotent +.align 16 +${pre}sha256_bcopy: + sub $inp, $out +.Loop_bcopy: + movzb ($inp), %eax + lea 1($inp), $inp + mov %al, -1($out,$inp) + dec $len + jnz .Loop_bcopy + ret +.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy + +.globl ${pre}sha256_hcopy +.hidden ${pre}sha256_hcopy +.type ${pre}sha256_hcopy,\@abi-omnipotent +.align 16 +${pre}sha256_hcopy: + mov 0($inp), %r8 + mov 8($inp), %r9 + mov 16($inp), %r10 + mov 24($inp), %r11 + mov %r8, 0($out) + mov %r9, 8($out) + mov %r10, 16($out) + mov %r11, 24($out) + ret +.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy +___ +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + print $_,"\n"; +} +close STDOUT; diff --git a/crypto/blst_src/asm/sha256-x86_64.pl b/crypto/blst_src/asm/sha256-x86_64.pl new file mode 100755 index 00000000000..22b376318fa --- /dev/null +++ b/crypto/blst_src/asm/sha256-x86_64.pl @@ -0,0 +1,789 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# sha256_block procedure for x86_64. +# +# This module is stripped of AVX and even scalar code paths, with +# raionale that +# +# a) AVX1 is [justifiably] faster than SSSE3 code path only on *one* +# processor, venerable Sandy Bridge; +# b) AVX2 incurs costly power transitions, which would be justifiable +# if AVX2 code was executing most of the time, which is not the +# case in the context; +# c) all comtemporary processors support SSSE3, so that nobody would +# actually use scalar code path anyway; +# +# See original module at CRYPTOGAMS for further details. 
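Since only the SHA-extension and SSSE3 code paths are emitted for x86_64, choosing between blst_sha256_block_data_order_shaext and blst_sha256_block_data_order is a run-time decision keyed on CPUID.(EAX=7,ECX=0):EBX bit 29 ("SHA"). The sketch below shows one way a caller could make that choice with GCC/Clang's <cpuid.h>; it is purely illustrative, makes no claim about how the library itself wires up dispatch, and the prototypes are assumptions inferred from the assembly's three-argument convention (state, input, block count).

#include <stddef.h>
#include <stdint.h>
#include <cpuid.h>  /* GCC/Clang CPUID helpers */

/* Assumed prototypes, inferred from the assembly. */
void blst_sha256_block_data_order(uint32_t h[8], const void *inp, size_t blocks);
void blst_sha256_block_data_order_shaext(uint32_t h[8], const void *inp, size_t blocks);

/* Illustrative dispatch: use the SHA-extension path when leaf-7 EBX bit 29
 * is set, otherwise fall back to the SSSE3 path (assumed baseline here,
 * matching the module's rationale above). */
static void sha256_blocks(uint32_t h[8], const void *inp, size_t blocks)
{
    unsigned int eax, ebx, ecx, edx;

    if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) && (ebx & (1u << 29)))
        blst_sha256_block_data_order_shaext(h, inp, blocks);
    else
        blst_sha256_block_data_order(h, inp, blocks);
}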
+ +$flavour = shift; +$output = pop; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +$pre="blst_"; +$func="${pre}sha256_block_data_order"; +$TABLE="K256"; +$SZ=4; +@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", + "%r8d","%r9d","%r10d","%r11d"); +($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); +$rounds=64; + +$ctx="%rdi"; # 1st arg, zapped by $a3 +$inp="%rsi"; # 2nd arg +$Tbl="%rbp"; + +$_ctx="16*$SZ+0*8(%rsp)"; +$_inp="16*$SZ+1*8(%rsp)"; +$_end="16*$SZ+2*8(%rsp)"; +$framesz="16*$SZ+3*8"; + +$code=<<___; +.text + +.align 64 +.type $TABLE,\@object +$TABLE: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 + .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm" +___ + +###################################################################### +# SIMD code paths +# +{{{ +###################################################################### +# Intel SHA Extensions implementation of SHA256 update function. 
+# +my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); + +my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); +my @MSG=map("%xmm$_",(3..6)); + +$code.=<<___; +.globl ${pre}sha256_block_data_order_shaext +.hidden ${pre}sha256_block_data_order_shaext +.type ${pre}sha256_block_data_order_shaext,\@function,3,"unwind" +.align 64 +${pre}sha256_block_data_order_shaext: +.cfi_startproc +___ +$code.=<<___ if ($win64); + sub \$0x58,%rsp +.cfi_adjust_cfa_offset 0x58 + movaps %xmm6,-0x58(%r11) +.cfi_offset %xmm6,-0x60 + movaps %xmm7,-0x48(%r11) +.cfi_offset %xmm7,-0x50 + movaps %xmm8,-0x38(%r11) +.cfi_offset %xmm8,-0x40 + movaps %xmm9,-0x28(%r11) +.cfi_offset %xmm9,-0x30 + movaps %xmm10,-0x18(%r11) +.cfi_offset %xmm10,-0x20 +.cfi_end_prologue +___ +$code.=<<___; + lea K256+0x80(%rip),$Tbl + movdqu ($ctx),$ABEF # DCBA + movdqu 16($ctx),$CDGH # HGFE + movdqa 0x100-0x80($Tbl),$TMP # byte swap mask + + pshufd \$0x1b,$ABEF,$Wi # ABCD + pshufd \$0xb1,$ABEF,$ABEF # CDAB + pshufd \$0x1b,$CDGH,$CDGH # EFGH + movdqa $TMP,$BSWAP # offload + palignr \$8,$CDGH,$ABEF # ABEF + punpcklqdq $Wi,$CDGH # CDGH + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu ($inp),@MSG[0] + movdqu 0x10($inp),@MSG[1] + movdqu 0x20($inp),@MSG[2] + pshufb $TMP,@MSG[0] + movdqu 0x30($inp),@MSG[3] + + movdqa 0*16-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + pshufb $TMP,@MSG[1] + movdqa $CDGH,$CDGH_SAVE # offload + sha256rnds2 $ABEF,$CDGH # 0-3 + pshufd \$0x0e,$Wi,$Wi + nop + movdqa $ABEF,$ABEF_SAVE # offload + sha256rnds2 $CDGH,$ABEF + + movdqa 1*16-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + pshufb $TMP,@MSG[2] + sha256rnds2 $ABEF,$CDGH # 4-7 + pshufd \$0x0e,$Wi,$Wi + lea 0x40($inp),$inp + sha256msg1 @MSG[1],@MSG[0] + sha256rnds2 $CDGH,$ABEF + + movdqa 2*16-0x80($Tbl),$Wi + paddd @MSG[2],$Wi + pshufb $TMP,@MSG[3] + sha256rnds2 $ABEF,$CDGH # 8-11 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[3],$TMP + palignr \$4,@MSG[2],$TMP + nop + paddd $TMP,@MSG[0] + sha256msg1 @MSG[2],@MSG[1] + sha256rnds2 $CDGH,$ABEF + + movdqa 3*16-0x80($Tbl),$Wi + paddd @MSG[3],$Wi + sha256msg2 @MSG[3],@MSG[0] + sha256rnds2 $ABEF,$CDGH # 12-15 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[0],$TMP + palignr \$4,@MSG[3],$TMP + nop + paddd $TMP,@MSG[1] + sha256msg1 @MSG[3],@MSG[2] + sha256rnds2 $CDGH,$ABEF +___ +for($i=4;$i<16-3;$i++) { +$code.=<<___; + movdqa $i*16-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256rnds2 $ABEF,$CDGH # 16-19... 
+ pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[1],$TMP + palignr \$4,@MSG[0],$TMP + nop + paddd $TMP,@MSG[2] + sha256msg1 @MSG[0],@MSG[3] + sha256rnds2 $CDGH,$ABEF +___ + push(@MSG,shift(@MSG)); +} +$code.=<<___; + movdqa 13*16-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256rnds2 $ABEF,$CDGH # 52-55 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[1],$TMP + palignr \$4,@MSG[0],$TMP + sha256rnds2 $CDGH,$ABEF + paddd $TMP,@MSG[2] + + movdqa 14*16-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + sha256rnds2 $ABEF,$CDGH # 56-59 + pshufd \$0x0e,$Wi,$Wi + sha256msg2 @MSG[1],@MSG[2] + movdqa $BSWAP,$TMP + sha256rnds2 $CDGH,$ABEF + + movdqa 15*16-0x80($Tbl),$Wi + paddd @MSG[2],$Wi + nop + sha256rnds2 $ABEF,$CDGH # 60-63 + pshufd \$0x0e,$Wi,$Wi + dec $num + nop + sha256rnds2 $CDGH,$ABEF + + paddd $CDGH_SAVE,$CDGH + paddd $ABEF_SAVE,$ABEF + jnz .Loop_shaext + + pshufd \$0xb1,$CDGH,$CDGH # DCHG + pshufd \$0x1b,$ABEF,$TMP # FEBA + pshufd \$0xb1,$ABEF,$ABEF # BAFE + punpckhqdq $CDGH,$ABEF # DCBA + palignr \$8,$TMP,$CDGH # HGFE + + movdqu $ABEF,($ctx) + movdqu $CDGH,16($ctx) +___ +$code.=<<___ if ($win64); + movaps -0x58(%r11),%xmm6 + movaps -0x48(%r11),%xmm7 + movaps -0x38(%r11),%xmm8 + movaps -0x28(%r11),%xmm9 + movaps -0x18(%r11),%xmm10 + mov %r11,%rsp +.cfi_def_cfa %r11,8 +.cfi_epilogue +___ +$code.=<<___; + ret +.cfi_endproc +.size ${pre}sha256_block_data_order_shaext,.-${pre}sha256_block_data_order_shaext +___ +}}} +{{{ + +my $a4=$T1; +my ($a,$b,$c,$d,$e,$f,$g,$h); + +sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + my $arg = pop; + $arg = "\$$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. + + '&ror ($a0,$Sigma1[2]-$Sigma1[1])', + '&mov ($a,$a1)', + '&mov ($a4,$f)', + + '&ror ($a1,$Sigma0[2]-$Sigma0[1])', + '&xor ($a0,$e)', + '&xor ($a4,$g)', # f^g + + '&ror ($a0,$Sigma1[1]-$Sigma1[0])', + '&xor ($a1,$a)', + '&and ($a4,$e)', # (f^g)&e + + '&xor ($a0,$e)', + '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] + '&mov ($a2,$a)', + + '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g + '&ror ($a1,$Sigma0[1]-$Sigma0[0])', + '&xor ($a2,$b)', # a^b, b^c in next round + + '&add ($h,$a4)', # h+=Ch(e,f,g) + '&ror ($a0,$Sigma1[0])', # Sigma1(e) + '&and ($a3,$a2)', # (b^c)&(a^b) + + '&xor ($a1,$a)', + '&add ($h,$a0)', # h+=Sigma1(e) + '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) + + '&ror ($a1,$Sigma0[0])', # Sigma0(a) + '&add ($d,$h)', # d+=h + '&add ($h,$a3)', # h+=Maj(a,b,c) + + '&mov ($a0,$d)', + '&add ($a1,$h);'. 
# h+=Sigma0(a) + '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' + ); +} + +###################################################################### +# SSSE3 code path +# +{ +my $Tbl = $inp; +my $_ctx="0(%rbp)"; +my $_inp="8(%rbp)"; +my $_end="16(%rbp)"; +my $framesz=4*8+$win64*16*4+8; + +my @X = map("%xmm$_",(0..3)); +my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); + +$code.=<<___; +.globl ${func} +.hidden ${func} +.type ${func},\@function,3,"unwind" +.align 64 +${func}: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + shl \$4,%rdx # num*16 + sub \$$framesz,%rsp +.cfi_adjust_cfa_offset $framesz + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + mov $ctx,0(%rsp) # save ctx, 1st arg + #mov $inp,8(%rsp) # save inp, 2nd arg + mov %rdx,16(%rsp) # save end pointer, "3rd" arg +___ +$code.=<<___ if ($win64); + movaps %xmm6,0x20(%rsp) +.cfi_offset %xmm6,-0x78 + movaps %xmm7,0x30(%rsp) +.cfi_offset %xmm7,-0x68 + movaps %xmm8,0x40(%rsp) +.cfi_offset %xmm8,-0x58 + movaps %xmm9,0x50(%rsp) +.cfi_offset %xmm9,-0x48 +___ +$code.=<<___; + mov %rsp,%rbp +.cfi_def_cfa_register %rbp +.cfi_end_prologue + + lea -16*$SZ(%rsp),%rsp + mov $SZ*0($ctx),$A + and \$-64,%rsp # align stack + mov $SZ*1($ctx),$B + mov $SZ*2($ctx),$C + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H +___ + +$code.=<<___; + #movdqa $TABLE+`$SZ*$rounds`+32(%rip),$t4 + #movdqa $TABLE+`$SZ*$rounds`+64(%rip),$t5 + jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa $TABLE+`$SZ*$rounds`(%rip),$t3 + mov $inp,$_inp # offload $inp + movdqu 0x00($inp),@X[0] + movdqu 0x10($inp),@X[1] + movdqu 0x20($inp),@X[2] + pshufb $t3,@X[0] + movdqu 0x30($inp),@X[3] + lea $TABLE(%rip),$Tbl + pshufb $t3,@X[1] + movdqa 0x00($Tbl),$t0 + movdqa 0x10($Tbl),$t1 + pshufb $t3,@X[2] + paddd @X[0],$t0 + movdqa 0x20($Tbl),$t2 + pshufb $t3,@X[3] + movdqa 0x30($Tbl),$t3 + paddd @X[1],$t1 + paddd @X[2],$t2 + paddd @X[3],$t3 + movdqa $t0,0x00(%rsp) + mov $A,$a1 + movdqa $t1,0x10(%rsp) + mov $B,$a3 + movdqa $t2,0x20(%rsp) + xor $C,$a3 # magic + movdqa $t3,0x30(%rsp) + mov $E,$a0 + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + sub \$`-16*$SZ`,$Tbl # size optimization +___ +sub Xupdate_256_SSSE3 () { + ( + '&movdqa ($t0,@X[1]);', + '&movdqa ($t3,@X[3])', + '&palignr ($t0,@X[0],$SZ)', # X[1..4] + '&palignr ($t3,@X[2],$SZ);', # X[9..12] + '&movdqa ($t1,$t0)', + '&movdqa ($t2,$t0);', + '&psrld ($t0,$sigma0[2])', + '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] + '&psrld ($t2,$sigma0[0])', + '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] + '&pslld ($t1,8*$SZ-$sigma0[1]);'. + '&pxor ($t0,$t2)', + '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. + '&pxor ($t0,$t1)', + '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
+ '&pxor ($t0,$t2);', + '&movdqa ($t2,$t3)', + '&pxor ($t0,$t1);', # sigma0(X[1..4]) + '&psrld ($t3,$sigma1[2])', + '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) + '&psrlq ($t2,$sigma1[0])', + '&pxor ($t3,$t2);', + '&psrlq ($t2,$sigma1[1]-$sigma1[0])', + '&pxor ($t3,$t2)', + '&pshufb ($t3,$t4)', # sigma1(X[14..15]) + '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) + '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] + '&movdqa ($t2,$t3);', + '&psrld ($t3,$sigma1[2])', + '&psrlq ($t2,$sigma1[0])', + '&pxor ($t3,$t2);', + '&psrlq ($t2,$sigma1[1]-$sigma1[0])', + '&pxor ($t3,$t2);', + '&movdqa ($t2,16*$j."($Tbl)")', + '&pshufb ($t3,$t5)', + '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) + ); +} + +sub SSSE3_256_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 104 instructions + + if (0) { + foreach (Xupdate_256_SSSE3()) { # 36 instructions + eval; + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + } + } else { # squeeze extra 4% on Westmere and 19% on Atom + eval(shift(@insns)); #@ + &movdqa ($t0,@X[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t3,@X[3]); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &palignr ($t0,@X[0],$SZ); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + &palignr ($t3,@X[2],$SZ); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &movdqa ($t1,$t0); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t2,$t0); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrld ($t0,$sigma0[2]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t3); # X[0..3] += X[9..12] + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrld ($t2,$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pshufd ($t3,@X[3],0b11111010); # X[4..15] + eval(shift(@insns)); + eval(shift(@insns)); #@ + &pslld ($t1,8*$SZ-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrld ($t2,$sigma0[1]-$sigma0[0]); + eval(shift(@insns)); + &pxor ($t0,$t1); + eval(shift(@insns)); + eval(shift(@insns)); + &pslld ($t1,$sigma0[1]-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t2); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &movdqa ($t2,$t3); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t1); # sigma0(X[1..4]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + &psrld ($t3,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrlq ($t2,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrlq ($t2,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + #&pshufb ($t3,$t4); # sigma1(X[14..15]) + &pshufd ($t3,$t3,0b10000000); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &psrldq ($t3,8); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) + 
eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pshufd ($t3,@X[0],0b01010000); # X[16..17] + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &movdqa ($t2,$t3); + eval(shift(@insns)); + eval(shift(@insns)); + &psrld ($t3,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrlq ($t2,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrlq ($t2,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + #&pshufb ($t3,$t5); + &pshufd ($t3,$t3,0b00001000); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t2,16*$j."($Tbl)"); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &pslldq ($t3,8); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + } + &paddd ($t2,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &movdqa (16*$j."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<4; $j++) { + &SSSE3_256_00_47($j,\&body_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &cmpb ($SZ-1+16*$SZ."($Tbl)",0); + &jne (".Lssse3_00_47"); + + for ($i=0; $i<16; ) { + foreach(body_00_15()) { eval; } + } +$code.=<<___; + mov $_ctx,$ctx + mov $a1,$A + mov $_inp,$inp + + add $SZ*0($ctx),$A + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + lea 16*$SZ($inp),$inp + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + jb .Lloop_ssse3 + + xorps %xmm0, %xmm0 + lea $framesz+6*8(%rbp),%r11 +.cfi_def_cfa %r11,8 + movaps %xmm0, 0x00(%rsp) # scrub the stack + movaps %xmm0, 0x10(%rsp) + movaps %xmm0, 0x20(%rsp) + movaps %xmm0, 0x30(%rsp) +___ +$code.=<<___ if ($win64); + movaps 0x20(%rbp),%xmm6 + movaps 0x30(%rbp),%xmm7 + movaps 0x40(%rbp),%xmm8 + movaps 0x50(%rbp),%xmm9 +___ +$code.=<<___; + mov $framesz(%rbp),%r15 +.cfi_restore %r15 + mov -40(%r11),%r14 +.cfi_restore %r14 + mov -32(%r11),%r13 +.cfi_restore %r13 + mov -24(%r11),%r12 +.cfi_restore %r12 + mov -16(%r11),%rbx +.cfi_restore %rbx + mov -8(%r11),%rbp +.cfi_restore %rbp +.cfi_epilogue + lea (%r11),%rsp + ret +.cfi_endproc +.size ${func},.-${func} +___ +} +}}} +{ +my ($out,$inp,$len) = $win64 ? 
("%rcx","%rdx","%r8") : # Win64 order + ("%rdi","%rsi","%rdx"); # Unix order +$code.=<<___; +.globl ${pre}sha256_emit +.hidden ${pre}sha256_emit +.type ${pre}sha256_emit,\@abi-omnipotent +.align 16 +${pre}sha256_emit: + mov 0($inp), %r8 + mov 8($inp), %r9 + mov 16($inp), %r10 + bswap %r8 + mov 24($inp), %r11 + bswap %r9 + mov %r8d, 4($out) + bswap %r10 + mov %r9d, 12($out) + bswap %r11 + mov %r10d, 20($out) + shr \$32, %r8 + mov %r11d, 28($out) + shr \$32, %r9 + mov %r8d, 0($out) + shr \$32, %r10 + mov %r9d, 8($out) + shr \$32, %r11 + mov %r10d, 16($out) + mov %r11d, 24($out) + ret +.size ${pre}sha256_emit,.-${pre}sha256_emit + +.globl ${pre}sha256_bcopy +.hidden ${pre}sha256_bcopy +.type ${pre}sha256_bcopy,\@abi-omnipotent +.align 16 +${pre}sha256_bcopy: + sub $inp, $out +.Loop_bcopy: + movzb ($inp), %eax + lea 1($inp), $inp + mov %al, -1($out,$inp) + dec $len + jnz .Loop_bcopy + ret +.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy + +.globl ${pre}sha256_hcopy +.hidden ${pre}sha256_hcopy +.type ${pre}sha256_hcopy,\@abi-omnipotent +.align 16 +${pre}sha256_hcopy: + mov 0($inp), %r8 + mov 8($inp), %r9 + mov 16($inp), %r10 + mov 24($inp), %r11 + mov %r8, 0($out) + mov %r9, 8($out) + mov %r10, 16($out) + mov %r11, 24($out) + ret +.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy +___ +} + +sub sha256op38 { + my $instr = shift; + my %opcodelet = ( + "sha256rnds2" => 0xcb, + "sha256msg1" => 0xcc, + "sha256msg2" => 0xcd ); + + if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) { + my @opcode=(0x0f,0x38); + push @opcode,$opcodelet{$instr}; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } else { + return $instr."\t".@_[0]; + } +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo; + + print $_,"\n"; +} +close STDOUT; diff --git a/crypto/blst_src/asm/x86_64-xlate.pl b/crypto/blst_src/asm/x86_64-xlate.pl new file mode 100755 index 00000000000..62be619d9fc --- /dev/null +++ b/crypto/blst_src/asm/x86_64-xlate.pl @@ -0,0 +1,1781 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Ascetic x86_64 AT&T to MASM/NASM assembler translator by @dot-asm. +# +# Why AT&T to MASM and not vice versa? Several reasons. Because AT&T +# format is way easier to parse. Because it's simpler to "gear" from +# Unix ABI to Windows one [see cross-reference "card" at the end of +# file]. Because Linux targets were available first... +# +# In addition the script also "distills" code suitable for GNU +# assembler, so that it can be compiled with more rigid assemblers, +# such as Solaris /usr/ccs/bin/as. +# +# This translator is not designed to convert *arbitrary* assembler +# code from AT&T format to MASM one. It's designed to convert just +# enough to provide for dual-ABI OpenSSL modules development... +# There *are* limitations and you might have to modify your assembler +# code or this script to achieve the desired result... +# +# Currently recognized limitations: +# +# - can't use multiple ops per line; +# +# Dual-ABI styling rules. +# +# 1. Adhere to Unix register and stack layout [see cross-reference +# ABI "card" at the end for explanation]. +# 2. Forget about "red zone," stick to more traditional blended +# stack frame allocation. If volatile storage is actually required +# that is. If not, just leave the stack as is. +# 3. 
Functions tagged with ".type name,@function" get crafted with +# unified Win64 prologue and epilogue automatically. If you want +# to take care of ABI differences yourself, tag functions as +# ".type name,@abi-omnipotent" instead. +# 4. To optimize the Win64 prologue you can specify number of input +# arguments as ".type name,@function,N." Keep in mind that if N is +# larger than 6, then you *have to* write "abi-omnipotent" code, +# because >6 cases can't be addressed with unified prologue. +# 5. Name local labels as .L*, do *not* use dynamic labels such as 1: +# (sorry about latter). +# 6. Don't use [or hand-code with .byte] "rep ret." "ret" mnemonic is +# required to identify the spots, where to inject Win64 epilogue! +# But on the pros, it's then prefixed with rep automatically:-) +# 7. Stick to explicit ip-relative addressing. If you have to use +# GOTPCREL addressing, stick to mov symbol@GOTPCREL(%rip),%r??. +# Both are recognized and translated to proper Win64 addressing +# modes. +# +# 8. In order to provide for structured exception handling unified +# Win64 prologue copies %rsp value to %rax. [Unless function is +# tagged with additional .type tag.] For further details see SEH +# paragraph at the end. +# 9. .init segment is allowed to contain calls to functions only. +# a. If function accepts more than 4 arguments *and* >4th argument +# is declared as non 64-bit value, do clear its upper part. + + +use strict; + +my $flavour = shift; +my $output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +open STDOUT,">$output" || die "can't open $output: $!" + if (defined($output)); + +my $gas=1; $gas=0 if ($output =~ /\.asm$/); +my $elf=1; $elf=0 if (!$gas); +my $dwarf=$elf; +my $win64=0; +my $prefix=""; +my $decor=".L"; + +my $masmref=8 + 50727*2**-32; # 8.00.50727 shipped with VS2005 +my $masm=0; +my $PTR=" PTR"; + +my $nasmref=2.03; +my $nasm=0; + +if ($flavour eq "mingw64") { $gas=1; $elf=0; $win64=1; + $prefix=`echo __USER_LABEL_PREFIX__ | \${CC:-false} -E -P -`; + $prefix =~ s|\R$||; # Better chomp + } +elsif ($flavour eq "macosx") { $gas=1; $elf=0; $prefix="_"; $decor="L\$"; } +elsif ($flavour eq "masm") { $gas=0; $elf=0; $masm=$masmref; $win64=1; $decor="\$L\$"; } +elsif ($flavour eq "nasm") { $gas=0; $elf=0; $nasm=$nasmref; $win64=1; $decor="\$L\$"; $PTR=""; } +elsif (!$gas) +{ if ($ENV{ASM} =~ m/nasm/ && `nasm -v` =~ m/version ([0-9]+)\.([0-9]+)/i) + { $nasm = $1 + $2*0.01; $PTR=""; } + elsif (`ml64 2>&1` =~ m/Version ([0-9]+)\.([0-9]+)(\.([0-9]+))?/) + { $masm = $1 + $2*2**-16 + $4*2**-32; } + die "no assembler found on %PATH%" if (!($nasm || $masm)); + $win64=1; + $elf=0; + $decor="\$L\$"; +} + +$dwarf=0 if($win64); + +my $current_segment; +my $current_function; +my %globals; + +{ package opcode; # pick up opcodes + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /^([a-z][a-z0-9]*)/i) { + bless $self,$class; + $self->{op} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + undef $self->{sz}; + if ($self->{op} =~ /^(movz)x?([bw]).*/) { # movz is pain... 
+ $self->{op} = $1; + $self->{sz} = $2; + } elsif ($self->{op} =~ /cmov[n]?[lb]$/) { + # pass through + } elsif ($self->{op} =~ /call|jmp/) { + $self->{sz} = ""; + } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn + $self->{sz} = ""; + } elsif ($self->{op} =~ /^[vk]/) { # VEX or k* such as kmov + $self->{sz} = ""; + } elsif ($self->{op} =~ /mov[dq]/ && $$line =~ /%xmm/) { + $self->{sz} = ""; + } elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) { + $self->{op} = $1; + $self->{sz} = $2; + } + } + $ret; + } + sub size { + my ($self, $sz) = @_; + $self->{sz} = $sz if (defined($sz) && !defined($self->{sz})); + $self->{sz}; + } + sub out { + my $self = shift; + if ($gas) { + if ($self->{op} eq "movz") { # movz is pain... + sprintf "%s%s%s",$self->{op},$self->{sz},shift; + } elsif ($self->{op} =~ /^set/) { + "$self->{op}"; + } elsif ($self->{op} eq "ret") { + my $epilogue = ""; + if ($win64 && $current_function->{abi} eq "svr4" + && !$current_function->{unwind}) { + $epilogue = "movq 8(%rsp),%rdi\n\t" . + "movq 16(%rsp),%rsi\n\t"; + } + $epilogue . ".byte 0xf3,0xc3"; + } elsif ($self->{op} eq "call" && !$elf && $current_segment eq ".init") { + ".p2align\t3\n\t.quad"; + } else { + "$self->{op}$self->{sz}"; + } + } else { + $self->{op} =~ s/^movz/movzx/; + if ($self->{op} eq "ret") { + $self->{op} = ""; + if ($win64 && $current_function->{abi} eq "svr4" + && !$current_function->{unwind}) { + $self->{op} = "mov rdi,QWORD$PTR\[8+rsp\]\t;WIN64 epilogue\n\t". + "mov rsi,QWORD$PTR\[16+rsp\]\n\t"; + } + $self->{op} .= "DB\t0F3h,0C3h\t\t;repret"; + } elsif ($self->{op} =~ /^(pop|push)f/) { + $self->{op} .= $self->{sz}; + } elsif ($self->{op} eq "call" && $current_segment eq ".CRT\$XCU") { + $self->{op} = "\tDQ"; + } + $self->{op}; + } + } + sub mnemonic { + my ($self, $op) = @_; + $self->{op}=$op if (defined($op)); + $self->{op}; + } +} +{ package const; # pick up constants, which start with $ + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /^\$([^,]+)/) { + bless $self, $class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + } + $ret; + } + sub out { + my $self = shift; + + $self->{value} =~ s/\b(0b[0-1]+)/oct($1)/eig; + if ($gas) { + # Solaris /usr/ccs/bin/as can't handle multiplications + # in $self->{value} + my $value = $self->{value}; + no warnings; # oct might complain about overflow, ignore here... 
+ $value =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi; + if ($value =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg) { + $self->{value} = $value; + } + sprintf "\$%s",$self->{value}; + } else { + my $value = $self->{value}; + $value =~ s/0x([0-9a-f]+)/0$1h/ig if ($masm); + sprintf "%s",$value; + } + } +} +{ package ea; # pick up effective addresses: expr(%reg,%reg,scale) + + my %szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR", + l=>"DWORD$PTR", d=>"DWORD$PTR", + q=>"QWORD$PTR", o=>"OWORD$PTR", + x=>"XMMWORD$PTR", y=>"YMMWORD$PTR", + z=>"ZMMWORD$PTR" ) if (!$gas); + + my %sifmap = ( ss=>"d", sd=>"q", # broadcast only + i32x2=>"q", f32x2=>"q", + i32x4=>"x", i64x2=>"x", i128=>"x", + f32x4=>"x", f64x2=>"x", f128=>"x", + i32x8=>"y", i64x4=>"y", + f32x8=>"y", f64x4=>"y" ) if (!$gas); + + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + # optional * ----vvv--- appears in indirect jmp/call + if ($$line =~ /^(\*?)([^\(,]*)\(([%\w,]+)\)((?:{[^}]+})*)/) { + bless $self, $class; + $self->{asterisk} = $1; + $self->{label} = $2; + ($self->{base},$self->{index},$self->{scale})=split(/,/,$3); + $self->{scale} = 1 if (!defined($self->{scale})); + $self->{opmask} = $4; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + if ($win64 && $self->{label} =~ s/\@GOTPCREL//) { + die if ($opcode->mnemonic() ne "mov"); + $opcode->mnemonic("lea"); + } + $self->{base} =~ s/^%//; + $self->{index} =~ s/^%// if (defined($self->{index})); + $self->{opcode} = $opcode; + } + $ret; + } + sub size {} + sub out { + my ($self, $sz) = @_; + + $self->{label} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; + $self->{label} =~ s/\.L/$decor/g; + + # Silently convert all EAs to 64-bit. This is required for + # elder GNU assembler and results in more compact code, + # *but* most importantly AES module depends on this feature! + $self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; + $self->{base} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; + + # Solaris /usr/ccs/bin/as can't handle multiplications + # in $self->{label}... + use integer; + $self->{label} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi; + $self->{label} =~ s/\b([0-9]+\s*[\*\/\%]\s*[0-9]+)\b/eval($1)/eg; + + # Some assemblers insist on signed presentation of 32-bit + # offsets, but sign extension is a tricky business in perl...
+ $self->{label} =~ s/\b([0-9]+)\b/unpack("l",pack("L",$1))/eg; + + # if base register is %rbp or %r13, see if it's possible to + # flip base and index registers [for better performance] + if (!$self->{label} && $self->{index} && $self->{scale}==1 && + $self->{base} =~ /(rbp|r13)/) { + $self->{base} = $self->{index}; $self->{index} = $1; + } + + if ($gas) { + $self->{label} =~ s/^___imp_/__imp__/ if ($flavour eq "mingw64"); + + if (defined($self->{index})) { + sprintf "%s%s(%s,%%%s,%d)%s", + $self->{asterisk},$self->{label}, + $self->{base}?"%$self->{base}":"", + $self->{index},$self->{scale}, + $self->{opmask}; + } else { + sprintf "%s%s(%%%s)%s", $self->{asterisk},$self->{label}, + $self->{base},$self->{opmask}; + } + } else { + $self->{label} =~ s/\./\$/g; + $self->{label} =~ s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/g; + $self->{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/); + + my $mnemonic = $self->{opcode}->mnemonic(); + ($self->{asterisk}) && ($sz="q") || + ($mnemonic =~ /^v?mov([qd])$/) && ($sz=$1) || + ($mnemonic =~ /^v?pinsr([qdwb])$/) && ($sz=$1) || + ($mnemonic =~ /^vpbroadcast([qdwb])$/) && ($sz=$1) || + ($mnemonic =~ /^v(?:broadcast|extract|insert)([sif]\w+)$/) + && ($sz=$sifmap{$1}); + + $self->{opmask} =~ s/%(k[0-7])/$1/; + + if (defined($self->{index})) { + sprintf "%s[%s%s*%d%s]%s",$szmap{$sz}, + $self->{label}?"$self->{label}+":"", + $self->{index},$self->{scale}, + $self->{base}?"+$self->{base}":"", + $self->{opmask}; + } elsif ($self->{base} eq "rip") { + sprintf "%s[%s]",$szmap{$sz},$self->{label}; + } else { + sprintf "%s[%s%s]%s", $szmap{$sz}, + $self->{label}?"$self->{label}+":"", + $self->{base},$self->{opmask}; + } + } + } +} +{ package register; # pick up registers, which start with %. + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + # optional * ----vvv--- appears in indirect jmp/call + if ($$line =~ /^(\*?)%(\w+)((?:{[^}]+})*)/) { + bless $self,$class; + $self->{asterisk} = $1; + $self->{value} = $2; + $self->{opmask} = $3; + $opcode->size($self->size()); + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + } + $ret; + } + sub size { + my $self = shift; + my $ret; + + if ($self->{value} =~ /^r[\d]+b$/i) { $ret="b"; } + elsif ($self->{value} =~ /^r[\d]+w$/i) { $ret="w"; } + elsif ($self->{value} =~ /^r[\d]+d$/i) { $ret="l"; } + elsif ($self->{value} =~ /^r[\w]+$/i) { $ret="q"; } + elsif ($self->{value} =~ /^[a-d][hl]$/i){ $ret="b"; } + elsif ($self->{value} =~ /^[\w]{2}l$/i) { $ret="b"; } + elsif ($self->{value} =~ /^[\w]{2}$/i) { $ret="w"; } + elsif ($self->{value} =~ /^e[a-z]{2}$/i){ $ret="l"; } + + $ret; + } + sub out { + my $self = shift; + if ($gas) { sprintf "%s%%%s%s", $self->{asterisk}, + $self->{value}, + $self->{opmask}; } + else { $self->{opmask} =~ s/%(k[0-7])/$1/; + $self->{value}.$self->{opmask}; } + } +} +{ package label; # pick up labels, which end with : + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /(^[\.\w]+)\:/) { + bless $self,$class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + $self->{value} =~ s/^\.L/$decor/; + } + $ret; + } + sub out { + my $self = shift; + + if ($gas) { + my $func = ($globals{$self->{value}} or $self->{value}) . ":"; + if ($current_function->{name} eq $self->{value}) { + $func .= "\n.cfi_".cfi_directive::startproc() if ($dwarf); + $func .= "\n .byte 0xf3,0x0f,0x1e,0xfa\n"; # endbranch + if ($win64 && $current_function->{abi} eq "svr4") { + my $fp = $current_function->{unwind} ?
"%r11" : "%rax"; + $func .= " movq %rdi,8(%rsp)\n"; + $func .= " movq %rsi,16(%rsp)\n"; + $func .= " movq %rsp,$fp\n"; + $func .= "${decor}SEH_begin_$current_function->{name}:\n"; + my $narg = $current_function->{narg}; + $narg=6 if (!defined($narg)); + $func .= " movq %rcx,%rdi\n" if ($narg>0); + $func .= " movq %rdx,%rsi\n" if ($narg>1); + $func .= " movq %r8,%rdx\n" if ($narg>2); + $func .= " movq %r9,%rcx\n" if ($narg>3); + $func .= " movq 40(%rsp),%r8\n" if ($narg>4); + $func .= " movq 48(%rsp),%r9\n" if ($narg>5); + } + } + $func; + } elsif ($self->{value} ne "$current_function->{name}") { + # Make all labels in masm global. + $self->{value} .= ":" if ($masm); + $self->{value} . ":"; + } elsif ($win64 && $current_function->{abi} eq "svr4") { + my $func = "$current_function->{name}" . + ($nasm ? ":" : "\tPROC $current_function->{scope}") . + "\n"; + my $fp = $current_function->{unwind} ? "r11" : "rax"; + $func .= " DB 243,15,30,250\n"; # endbranch + $func .= " mov QWORD$PTR\[8+rsp\],rdi\t;WIN64 prologue\n"; + $func .= " mov QWORD$PTR\[16+rsp\],rsi\n"; + $func .= " mov $fp,rsp\n"; + $func .= "${decor}SEH_begin_$current_function->{name}:"; + $func .= ":" if ($masm); + $func .= "\n"; + my $narg = $current_function->{narg}; + $narg=6 if (!defined($narg)); + $func .= " mov rdi,rcx\n" if ($narg>0); + $func .= " mov rsi,rdx\n" if ($narg>1); + $func .= " mov rdx,r8\n" if ($narg>2); + $func .= " mov rcx,r9\n" if ($narg>3); + $func .= " mov r8,QWORD$PTR\[40+rsp\]\n" if ($narg>4); + $func .= " mov r9,QWORD$PTR\[48+rsp\]\n" if ($narg>5); + $func .= "\n"; + } else { + "$current_function->{name}". + ($nasm ? ":" : "\tPROC $current_function->{scope}"). + "\n DB 243,15,30,250"; # endbranch + } + } +} +{ package expr; # pick up expressions + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /(^[^,]+)/) { + bless $self,$class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + $self->{value} =~ s/\@PLT// if (!$elf); + $self->{value} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; + $self->{value} =~ s/\.L/$decor/g; + $self->{opcode} = $opcode; + } + $ret; + } + sub out { + my $self = shift; + $self->{value}; + } +} + +my @xdata_seg = (".section .xdata", ".align 8"); +my @pdata_seg = (".section .pdata", ".align 4"); + +{ package cfi_directive; + # CFI directives annotate instructions that are significant for + # stack unwinding procedure compliant with DWARF specification, + # see http://dwarfstd.org/. Besides naturally expected for this + # script platform-specific filtering function, this module adds + # three auxiliary synthetic directives not recognized by [GNU] + # assembler: + # + # - .cfi_push to annotate push instructions in prologue, which + # translates to .cfi_adjust_cfa_offset (if needed) and + # .cfi_offset; + # - .cfi_pop to annotate pop instructions in epilogue, which + # translates to .cfi_adjust_cfa_offset (if needed) and + # .cfi_restore; + # - [and most notably] .cfi_cfa_expression which encodes + # DW_CFA_def_cfa_expression and passes it to .cfi_escape as + # byte vector; + # + # CFA expressions were introduced in DWARF specification version + # 3 and describe how to deduce CFA, Canonical Frame Address. This + # becomes handy if your stack frame is variable and you can't + # spare register for [previous] frame pointer. Suggested directive + # syntax is made-up mix of DWARF operator suffixes [subset of] + # and references to registers with optional bias. 
Following example + # describes offloaded *original* stack pointer at specific offset + # from *current* stack pointer: + # + # .cfi_cfa_expression %rsp+40,deref,+8 + # + # Final +8 has everything to do with the fact that CFA is defined + # as reference to top of caller's stack, and on x86_64 call to + # subroutine pushes 8-byte return address. In other words original + # stack pointer upon entry to a subroutine is 8 bytes off from CFA. + # + # In addition the .cfi directives are re-purposed even for Win64 + # stack unwinding. Two more synthetic directives were added: + # + # - .cfi_end_prologue to denote point when all non-volatile + # registers are saved and stack or [chosen] frame pointer is + # stable; + # - .cfi_epilogue to denote point when all non-volatile registers + # are restored [and it even adds missing .cfi_restore-s]; + # + # Though it's not universal "miracle cure," it has its limitations. + # Most notably .cfi_cfa_expression won't start working... For more + # information see the end of this file. + + # Below constants are taken from "DWARF Expressions" section of the + # DWARF specification, section is numbered 7.7 in versions 3 and 4. + my %DW_OP_simple = ( # no-arg operators, mapped directly + deref => 0x06, dup => 0x12, + drop => 0x13, over => 0x14, + pick => 0x15, swap => 0x16, + rot => 0x17, xderef => 0x18, + + abs => 0x19, and => 0x1a, + div => 0x1b, minus => 0x1c, + mod => 0x1d, mul => 0x1e, + neg => 0x1f, not => 0x20, + or => 0x21, plus => 0x22, + shl => 0x24, shr => 0x25, + shra => 0x26, xor => 0x27, + ); + + my %DW_OP_complex = ( # used in specific subroutines + constu => 0x10, # uleb128 + consts => 0x11, # sleb128 + plus_uconst => 0x23, # uleb128 + lit0 => 0x30, # add 0-31 to opcode + reg0 => 0x50, # add 0-31 to opcode + breg0 => 0x70, # add 0-31 to opcole, sleb128 + regx => 0x90, # uleb28 + fbreg => 0x91, # sleb128 + bregx => 0x92, # uleb128, sleb128 + piece => 0x93, # uleb128 + ); + + # Following constants are defined in x86_64 ABI supplement, for + # example available at https://www.uclibc.org/docs/psABI-x86_64.pdf, + # see section 3.7 "Stack Unwind Algorithm". + my %DW_reg_idx = ( + "%rax"=>0, "%rdx"=>1, "%rcx"=>2, "%rbx"=>3, + "%rsi"=>4, "%rdi"=>5, "%rbp"=>6, "%rsp"=>7, + "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11, + "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15 + ); + + my ($cfa_reg, $cfa_off, $cfa_rsp, %saved_regs); + my @cfa_stack; + + # [us]leb128 format is variable-length integer representation base + # 2^128, with most significant bit of each byte being 0 denoting + # *last* most significant digit. See "Variable Length Data" in the + # DWARF specification, numbered 7.6 at least in versions 3 and 4. + sub sleb128 { + use integer; # get right shift extend sign + + my $val = shift; + my $sign = ($val < 0) ? -1 : 0; + my @ret = (); + + while(1) { + push @ret, $val&0x7f; + + # see if remaining bits are same and equal to most + # significant bit of the current digit, if so, it's + # last digit... + last if (($val>>6) == $sign); + + @ret[-1] |= 0x80; + $val >>= 7; + } + + return @ret; + } + sub uleb128 { + my $val = shift; + my @ret = (); + + while(1) { + push @ret, $val&0x7f; + + # see if it's last significant digit... 
+ last if (($val >>= 7) == 0); + + @ret[-1] |= 0x80; + } + + return @ret; + } + sub const { + my $val = shift; + + if ($val >= 0 && $val < 32) { + return ($DW_OP_complex{lit0}+$val); + } + return ($DW_OP_complex{consts}, sleb128($val)); + } + sub reg { + my $val = shift; + + return if ($val !~ m/^(%r\w+)(?:([\+\-])((?:0x)?[0-9a-f]+))?/); + + my $reg = $DW_reg_idx{$1}; + my $off = eval ("0 $2 $3"); + + return (($DW_OP_complex{breg0} + $reg), sleb128($off)); + # Yes, we use DW_OP_bregX+0 to push register value and not + # DW_OP_regX, because latter would require even DW_OP_piece, + # which would be a waste under the circumstances. If you have + # to use DWP_OP_reg, use "regx:N"... + } + sub cfa_expression { + my $line = shift; + my @ret; + + foreach my $token (split(/,\s*/,$line)) { + if ($token =~ /^%r/) { + push @ret,reg($token); + } elsif ($token =~ /((?:0x)?[0-9a-f]+)\((%r\w+)\)/) { + push @ret,reg("$2+$1"); + } elsif ($token =~ /(\w+):(\-?(?:0x)?[0-9a-f]+)(U?)/i) { + my $i = 1*eval($2); + push @ret,$DW_OP_complex{$1}, ($3 ? uleb128($i) : sleb128($i)); + } elsif (my $i = 1*eval($token) or $token eq "0") { + if ($token =~ /^\+/) { + push @ret,$DW_OP_complex{plus_uconst},uleb128($i); + } else { + push @ret,const($i); + } + } else { + push @ret,$DW_OP_simple{$token}; + } + } + + # Finally we return DW_CFA_def_cfa_expression, 15, followed by + # length of the expression and of course the expression itself. + return (15,scalar(@ret),@ret); + } + + # Following constants are defined in "x64 exception handling" at + # https://docs.microsoft.com/ and match the register sequence in + # CONTEXT structure defined in winnt.h. + my %WIN64_reg_idx = ( + "%rax"=>0, "%rcx"=>1, "%rdx"=>2, "%rbx"=>3, + "%rsp"=>4, "%rbp"=>5, "%rsi"=>6, "%rdi"=>7, + "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11, + "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15 + ); + sub xdata { + our @dat = (); + our $len = 0; + + sub allocstack { + my $offset = shift; + + if ($offset) { + if ($offset <= 128) { + $offset = ($offset - 8) >> 3; + push @dat, [0,$offset<<4|2]; # UWOP_ALLOC_SMALL + } elsif ($offset < 0x80000) { + push @dat, [0,0x01,unpack("C2",pack("v",$offset>>3))]; + } else { + push @dat, [0,0x11,unpack("C4",pack("V",$offset))]; + } + $len += $#{@dat[-1]}+1; + } + } + + # allocate stack frame + if (my $offset = -8 - $cfa_rsp) { + # but see if frame pointer is among saved registers + if ($cfa_reg ne "%rsp" and my $fp_off = $saved_regs{$cfa_reg}) { + $fp_off = -8 - $fp_off; + allocstack($fp_off-8); + $offset -= $fp_off; + push @dat, [0,$WIN64_reg_idx{$cfa_reg}<<4]; # UWOP_PUSH_NONVOL + $len += $#{@dat[-1]}+1; + } + allocstack($offset); + } + # set up frame pointer + my $fp_info = 0; + if ($cfa_reg ne "%rsp") { + my $offset = $cfa_off - $cfa_rsp; + ($offset > 240 or $offset&0xf) and die "invalid FP offset $offset"; + $fp_info = ($offset&-16)|$WIN64_reg_idx{$cfa_reg}; + push @dat, [0,3]; # UWOP_SET_FPREG + $len += $#{@dat[-1]}+1; + } + # save registers + foreach my $key (sort { $saved_regs{$b} <=> $saved_regs{$a} } + keys(%saved_regs)) { + next if ($cfa_reg ne "%rsp" && $cfa_reg eq $key); + my $offset = $saved_regs{$key} - $cfa_rsp; + if ($key =~ /%xmm([0-9]+)/) { + if ($offset < 0x100000) { + push @dat, [0,($1<<4)|8,unpack("C2",pack("v",$offset>>4))]; + } else { + push @dat, [0,($1<<4)|9,unpack("C4",pack("V",$offset))]; + } + } else { + if ($offset < 0x80000) { + push @dat, [0,(($WIN64_reg_idx{$key})<<4)|4, + unpack("C2",pack("v",$offset>>3))]; + } else { + push @dat, [0,(($WIN64_reg_idx{$key})<<4)|5, + 
unpack("C4",pack("V",$offset))]; + } + } + $len += $#{@dat[-1]}+1; + } + + my @ret; + # generate 4-byte descriptor + push @ret, ".byte 1,0,".($len/2).",$fp_info"; + $len += 4; + # pad to 8*n + unshift @dat, [(0)x((-$len)&7)] if ($len&7); + # emit data + while(defined(my $row = pop @dat)) { + push @ret, ".byte ". join(",", + map { sprintf "0x%02x",$_ } @{$row}); + } + + return @ret; + } + sub startproc { + return if ($cfa_rsp == -8); + ($cfa_reg, $cfa_off, $cfa_rsp) = ("%rsp", -8, -8); + %saved_regs = (); + return "startproc"; + } + sub endproc { + return if ($cfa_rsp == 0); + ($cfa_reg, $cfa_off, $cfa_rsp) = ("%rsp", 0, 0); + %saved_regs = (); + return "endproc"; + } + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ s/^\s*\.cfi_(\w+)\s*//) { + bless $self,$class; + $ret = $self; + undef $self->{value}; + my $dir = $1; + + SWITCH: for ($dir) { + # What is $cfa_rsp? Effectively it's difference between %rsp + # value and current CFA, Canonical Frame Address, which is + # why it starts with -8. Recall that CFA is top of caller's + # stack... + /startproc/ && do { $dir = startproc(); last; }; + /endproc/ && do { $dir = endproc(); + # .cfi_remember_state directives that are not + # matched with .cfi_restore_state are + # unnecessary. + die "unpaired .cfi_remember_state" if (@cfa_stack); + last; + }; + /def_cfa_register/ + && do { $cfa_off = $cfa_rsp if ($cfa_reg eq "%rsp"); + $cfa_reg = $$line; + last; + }; + /def_cfa_offset/ + && do { $cfa_off = -1*eval($$line); + $cfa_rsp = $cfa_off if ($cfa_reg eq "%rsp"); + last; + }; + /adjust_cfa_offset/ + && do { my $val = 1*eval($$line); + $cfa_off -= $val; + if ($cfa_reg eq "%rsp") { + $cfa_rsp -= $val; + } + last; + }; + /def_cfa/ && do { if ($$line =~ /(%r\w+)\s*,\s*(.+)/) { + $cfa_reg = $1; + $cfa_off = -1*eval($2); + $cfa_rsp = $cfa_off if ($cfa_reg eq "%rsp"); + } + last; + }; + /push/ && do { $dir = undef; + $cfa_rsp -= 8; + if ($cfa_reg eq "%rsp") { + $cfa_off = $cfa_rsp; + $self->{value} = ".cfi_adjust_cfa_offset\t8\n"; + } + $saved_regs{$$line} = $cfa_rsp; + $self->{value} .= ".cfi_offset\t$$line,$cfa_rsp"; + last; + }; + /pop/ && do { $dir = undef; + $cfa_rsp += 8; + if ($cfa_reg eq "%rsp") { + $cfa_off = $cfa_rsp; + $self->{value} = ".cfi_adjust_cfa_offset\t-8\n"; + } + $self->{value} .= ".cfi_restore\t$$line"; + delete $saved_regs{$$line}; + last; + }; + /cfa_expression/ + && do { $dir = undef; + $self->{value} = ".cfi_escape\t" . + join(",", map(sprintf("0x%02x", $_), + cfa_expression($$line))); + last; + }; + /remember_state/ + && do { push @cfa_stack, + [$cfa_reg,$cfa_off,$cfa_rsp,%saved_regs]; + last; + }; + /restore_state/ + && do { ($cfa_reg,$cfa_off,$cfa_rsp,%saved_regs) + = @{pop @cfa_stack}; + last; + }; + /offset/ && do { if ($$line =~ /(%\w+)\s*,\s*(.+)/) { + $saved_regs{$1} = 1*eval($2); + $dir = undef if ($1 =~ /%xmm/); + } + last; + }; + /restore/ && do { delete $saved_regs{$$line}; last; }; + /end_prologue/ + && do { $dir = undef; + $self->{win64} = ".endprolog"; + last; + }; + /epilogue/ && do { $dir = undef; + $self->{win64} = ".epilogue"; + $self->{value} = join("\n", + map { ".cfi_restore\t$_" } + sort keys(%saved_regs)); + %saved_regs = (); + last; + }; + } + + $self->{value} = ".cfi_$dir\t$$line" if ($dir); + + $$line = ""; + } + + return $ret; + } + sub out { + my $self = shift; + return $self->{value} if ($dwarf); + + if ($win64 and $current_function->{unwind} + and my $ret = $self->{win64}) { + my ($reg, $off) = ($cfa_reg =~ /%(?!rsp)/) ? 
($', $cfa_off) + : ("rsp", $cfa_rsp); + my $fname = $current_function->{name}; + + if ($ret eq ".endprolog") { + $saved_regs{"%rdi"} = 0; # relative to CFA, remember? + $saved_regs{"%rsi"} = 8; + + push @pdata_seg, + ".rva .LSEH_begin_${fname}", + ".rva .LSEH_body_${fname}", + ".rva .LSEH_info_${fname}_prologue",""; + push @xdata_seg, + ".LSEH_info_${fname}_prologue:", + ".byte 1,0,5,0x0b", # 5 unwind codes, %r11 is FP + ".byte 0,0x74,1,0", # %rdi at 8(%rsp) + ".byte 0,0x64,2,0", # %rsi at 16(%rsp) + ".byte 0,0x03", # set frame pointer + ".byte 0,0" # padding + ; + push @pdata_seg, + ".rva .LSEH_body_${fname}", + ".rva .LSEH_epilogue_${fname}", + ".rva .LSEH_info_${fname}_body",""; + push @xdata_seg,".LSEH_info_${fname}_body:", xdata(); + $ret = "${decor}SEH_body_${fname}:"; + $ret .= ":" if ($masm); $ret .= "\n"; + } elsif ($ret eq ".epilogue") { + %saved_regs = (); + $saved_regs{"%rdi"} = 0; # relative to CFA, remember? + $saved_regs{"%rsi"} = 8; + $cfa_rsp = $cfa_off; + + push @pdata_seg, + ".rva .LSEH_epilogue_${fname}", + ".rva .LSEH_end_${fname}", + ".rva .LSEH_info_${fname}_epilogue",""; + push @xdata_seg,".LSEH_info_${fname}_epilogue:", xdata(), ""; + $ret = "${decor}SEH_epilogue_${fname}:"; + $ret .= ":" if ($masm); $ret .= "\n"; + if ($gas) { + $ret .= " mov ".(0-$off)."(%$reg),%rdi\n"; + $ret .= " mov ".(8-$off)."(%$reg),%rsi\n"; + } else { + $ret .= " mov rdi,QWORD$PTR\[".(0-$off)."+$reg\]"; + $ret .= " ;WIN64 epilogue\n"; + $ret .= " mov rsi,QWORD$PTR\[".(8-$off)."+$reg\]\n"; + } + } + return $ret; + } + return; + } +} +{ package directive; # pick up directives, which start with . + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + my $dir; + + # chain-call to cfi_directive + $ret = cfi_directive->re($line) and return $ret; + + if ($$line =~ /^\s*(\.\w+)/) { + bless $self,$class; + $dir = $1; + $ret = $self; + undef $self->{value}; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + SWITCH: for ($dir) { + /\.global|\.globl|\.extern/ + && do { $globals{$$line} = $prefix . $$line; + $$line = $globals{$$line} if ($prefix); + last; + }; + /\.type/ && do { my ($sym,$type,$narg,$unwind) = split(',',$$line); + if ($type eq "\@function") { + undef $current_function; + $current_function->{name} = $sym; + $current_function->{abi} = "svr4"; + $current_function->{narg} = $narg; + $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; + $current_function->{unwind} = $unwind; + } elsif ($type eq "\@abi-omnipotent") { + undef $current_function; + $current_function->{name} = $sym; + $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; + } + $$line =~ s/\@abi\-omnipotent/\@function/; + $$line =~ s/\@function.*/\@function/; + last; + }; + /\.asciz/ && do { if ($$line =~ /^"(.*)"$/) { + $dir = ".byte"; + $$line = join(",",unpack("C*",$1),0); + } + last; + }; + /\.rva|\.long|\.quad/ + && do { $$line =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; + $$line =~ s/\.L/$decor/g; + last; + }; + } + + if ($gas) { + $self->{value} = $dir . "\t" . $$line; + + if ($dir =~ /\.extern/) { + $self->{value} = ""; # swallow extern + } elsif (!$elf && $dir =~ /\.type/) { + $self->{value} = ""; + $self->{value} = ".def\t" . ($globals{$1} or $1) . ";\t" . + (defined($globals{$1})?".scl 2;":".scl 3;") . 
+ "\t.type 32;\t.endef" + if ($win64 && $$line =~ /([^,]+),\@function/); + } elsif ($dir =~ /\.size/) { + $self->{value} = "" if (!$elf); + if ($dwarf and my $endproc = cfi_directive::endproc()) { + $self->{value} = ".cfi_$endproc\n$self->{value}"; + } elsif (!$elf && defined($current_function)) { + $self->{value} .= "${decor}SEH_end_$current_function->{name}:" + if ($win64 && $current_function->{abi} eq "svr4"); + undef $current_function; + } + } elsif (!$elf && $dir =~ /\.align/) { + $self->{value} = ".p2align\t" . (log($$line)/log(2)); + } elsif ($dir eq ".section") { + $current_segment=$$line; + if (!$elf && $current_segment eq ".init") { + if ($flavour eq "macosx") { $self->{value} = ".mod_init_func"; } + elsif ($flavour eq "mingw64") { $self->{value} = ".section\t.ctors"; } + } + } elsif ($dir =~ /\.(text|data)/) { + $current_segment=".$1"; + } elsif ($dir =~ /\.hidden/) { + if ($flavour eq "macosx") { $self->{value} = ".private_extern\t$prefix$$line"; } + elsif ($flavour eq "mingw64") { $self->{value} = ""; } + } elsif ($dir =~ /\.comm/) { + $self->{value} = "$dir\t$prefix$$line"; + $self->{value} =~ s|,([0-9]+),([0-9]+)$|",$1,".log($2)/log(2)|e if ($flavour eq "macosx"); + } + $$line = ""; + return $self; + } + + # non-gas case or nasm/masm + SWITCH: for ($dir) { + /\.text/ && do { my $v=undef; + if ($nasm) { + $v="section .text code align=64\n"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = ".text\$"; + $v.="$current_segment\tSEGMENT "; + $v.=$masm>=$masmref ? "ALIGN(256)" : "PAGE"; + $v.=" 'CODE'"; + } + $self->{value} = $v; + last; + }; + /\.data/ && do { my $v=undef; + if ($nasm) { + $v="section .data data align=8\n"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = "_DATA"; + $v.="$current_segment\tSEGMENT"; + } + $self->{value} = $v; + last; + }; + /\.section/ && do { my $v=undef; + $$line =~ s/([^,]*).*/$1/; + $$line = ".CRT\$XCU" if ($$line eq ".init"); + if ($nasm) { + $v="section $$line"; + if ($$line=~/\.([px])data/) { + $v.=" rdata align="; + $v.=$1 eq "p"? 4 : 8; + } elsif ($$line=~/\.CRT\$/i) { + $v.=" rdata align=8"; + } + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $v.="$$line\tSEGMENT"; + if ($$line=~/\.([px])data/) { + $v.=" READONLY"; + $v.=" ALIGN(".($1 eq "p" ? 4 : 8).")" if ($masm>=$masmref); + } elsif ($$line=~/\.CRT\$/i) { + $v.=" READONLY "; + $v.=$masm>=$masmref ? "ALIGN(8)" : "DWORD"; + } + } + $current_segment = $$line; + $self->{value} = $v; + last; + }; + /\.extern/ && do { $self->{value} = "EXTERN\t".$$line; + $self->{value} .= ":NEAR" if ($masm); + last; + }; + /\.globl|.global/ + && do { $self->{value} = $masm?"PUBLIC":"global"; + $self->{value} .= "\t".$$line; + last; + }; + /\.size/ && do { if (defined($current_function)) { + undef $self->{value}; + if ($current_function->{abi} eq "svr4") { + $self->{value}="${decor}SEH_end_$current_function->{name}:"; + $self->{value}.=":\n" if($masm); + } + $self->{value}.="$current_function->{name}\tENDP" if($masm && $current_function->{name}); + undef $current_function; + } + last; + }; + /\.align/ && do { my $max = ($masm && $masm>=$masmref) ? 
256 : 4096; + $self->{value} = "ALIGN\t".($$line>$max?$max:$$line); + last; + }; + /\.(value|long|rva|quad)/ + && do { my $sz = substr($1,0,1); + my @arr = split(/,\s*/,$$line); + my $last = pop(@arr); + my $conv = sub { my $var=shift; + $var=~s/^(0b[0-1]+)/oct($1)/eig; + $var=~s/^0x([0-9a-f]+)/0$1h/ig if ($masm); + if ($sz eq "D" && ($current_segment=~/.[px]data/ || $dir eq ".rva")) + { $var=~s/^([_a-z\$\@][_a-z0-9\$\@]*)/$nasm?"$1 wrt ..imagebase":"imagerel $1"/egi; } + $var; + }; + + $sz =~ tr/bvlrq/BWDDQ/; + $self->{value} = "\tD$sz\t"; + for (@arr) { $self->{value} .= &$conv($_).","; } + $self->{value} .= &$conv($last); + last; + }; + /\.byte/ && do { my @str=split(/,\s*/,$$line); + map(s/(0b[0-1]+)/oct($1)/eig,@str); + map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm); + while ($#str>15) { + $self->{value}.="DB\t" + .join(",",@str[0..15])."\n"; + foreach (0..15) { shift @str; } + } + $self->{value}.="DB\t" + .join(",",@str) if (@str); + last; + }; + /\.comm/ && do { my @str=split(/,\s*/,$$line); + my $v=undef; + if ($nasm) { + $v.="common $prefix@str[0] @str[1]"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = "_DATA"; + $v.="$current_segment\tSEGMENT\n"; + $v.="COMM @str[0]:DWORD:".@str[1]/4; + } + $self->{value} = $v; + last; + }; + } + $$line = ""; + } + + $ret; + } + sub out { + my $self = shift; + $self->{value}; + } +} + +# Upon initial x86_64 introduction SSE>2 extensions were not introduced +# yet. In order not to be bothered by tracing exact assembler versions, +# but at the same time to provide a bare security minimum of AES-NI, we +# hard-code some instructions. Extensions past AES-NI on the other hand +# are traced by examining assembler version in individual perlasm +# modules... + +my %regrm = ( "%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3, + "%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7 ); + +sub rex { + my $opcode=shift; + my ($dst,$src,$rex)=@_; + + $rex|=0x04 if($dst>=8); + $rex|=0x01 if($src>=8); + push @$opcode,($rex|0x40) if ($rex); +} + +my $movq = sub { # elderly gas can't handle inter-register movq + my $arg = shift; + my @opcode=(0x66); + if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) { + my ($src,$dst)=($1,$2); + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,$src,$dst,0x8); + push @opcode,0x0f,0x7e; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + @opcode; + } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) { + my ($src,$dst)=($2,$1); + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,$src,$dst,0x8); + push @opcode,0x0f,0x6e; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + @opcode; + } else { + (); + } +}; + +my $pextrd = sub { + if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) { + my @opcode=(0x66); + my $imm=$1; + my $src=$2; + my $dst=$3; + if ($dst =~ /%r([0-9]+)d/) { $dst = $1; } + elsif ($dst =~ /%e/) { $dst = $regrm{$dst}; } + rex(\@opcode,$src,$dst); + push @opcode,0x0f,0x3a,0x16; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + push @opcode,$imm; + @opcode; + } else { + (); + } +}; + +my $pinsrd = sub { + if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + my $imm=$1; + my $src=$2; + my $dst=$3; + if ($src =~ /%r([0-9]+)/) { $src = $1; } + elsif ($src =~ /%e/) { $src = $regrm{$src}; } + rex(\@opcode,$dst,$src); + push @opcode,0x0f,0x3a,0x22; + push @opcode,0xc0|(($dst&7)<<3)|($src&7); # ModR/M + push @opcode,$imm; + @opcode; + } else { + (); + } +}; + +my $pshufb = sub { + if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + 
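+	# Worked example (illustrative): for the operands "%xmm1,%xmm0" this
+	# closure returns 0x66,0x0f,0x38,0x00,0xc1, i.e. the 66 0F 38 00 /r
+	# encoding of pshufb with ModR/M 0xc1 (reg=xmm0, r/m=xmm1), which
+	# process() below then prints as a raw .byte/DB sequence.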
rex(\@opcode,$2,$1); + push @opcode,0x0f,0x38,0x00; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + @opcode; + } else { + (); + } +}; + +my $palignr = sub { + if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x3a,0x0f; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + push @opcode,$1; + @opcode; + } else { + (); + } +}; + +my $pclmulqdq = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x3a,0x44; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +my $rdrand = sub { + if (shift =~ /%[er](\w+)/) { + my @opcode=(); + my $dst=$1; + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,0,$dst,8); + push @opcode,0x0f,0xc7,0xf0|($dst&7); + @opcode; + } else { + (); + } +}; + +my $rdseed = sub { + if (shift =~ /%[er](\w+)/) { + my @opcode=(); + my $dst=$1; + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,0,$dst,8); + push @opcode,0x0f,0xc7,0xf8|($dst&7); + @opcode; + } else { + (); + } +}; + +# Not all AVX-capable assemblers recognize AMD XOP extension. Since we +# are using only two instructions hand-code them in order to be excused +# from chasing assembler versions... + +sub rxb { + my $opcode=shift; + my ($dst,$src1,$src2,$rxb)=@_; + + $rxb|=0x7<<5; + $rxb&=~(0x04<<5) if($dst>=8); + $rxb&=~(0x01<<5) if($src1>=8); + $rxb&=~(0x02<<5) if($src2>=8); + push @$opcode,$rxb; +} + +my $vprotd = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x8f); + rxb(\@opcode,$3,$2,-1,0x08); + push @opcode,0x78,0xc2; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +my $vprotq = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x8f); + rxb(\@opcode,$3,$2,-1,0x08); + push @opcode,0x78,0xc3; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +# Intel Control-flow Enforcement Technology extension. All functions and +# indirect branch targets will have to start with this instruction... +# However, it should not be used in functions' prologues explicitly, as +# it's added automatically [and in the right spot]. Which leaves only +# non-function indirect branch targets, such as in a case-like dispatch +# table, as application area. + +my $endbr64 = sub { + (0xf3,0x0f,0x1e,0xfa); +}; + +######################################################################## + +if ($nasm) { + print <<___; +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +___ +} elsif ($masm) { + print <<___; +OPTION DOTNAME +___ +} + +sub process { + my $line = shift; + + $line =~ s|\R$||; # Better chomp + + $line =~ s|[#!].*$||; # get rid of asm-style comments... + $line =~ s|/\*.*\*/||; # ... and C-style comments... + $line =~ s|^\s+||; # ... and skip white spaces in beginning + $line =~ s|\s+$||; # ... 
and at the end + + if (my $label=label->re(\$line)) { print $label->out(); } + + if (my $directive=directive->re(\$line)) { + printf "%s",$directive->out(); + } elsif (my $opcode=opcode->re(\$line)) { + my $asm = eval("\$".$opcode->mnemonic()); + + if ((ref($asm) eq 'CODE') && scalar(my @bytes=&$asm($line))) { + print $gas?".byte\t":"DB\t",join(',',@bytes),"\n"; + next; + } + + my @args; + ARGUMENT: while (1) { + my $arg; + + ($arg=register->re(\$line, $opcode))|| + ($arg=const->re(\$line)) || + ($arg=ea->re(\$line, $opcode)) || + ($arg=expr->re(\$line, $opcode)) || + last ARGUMENT; + + push @args,$arg; + + last ARGUMENT if ($line !~ /^,/); + + $line =~ s/^,\s*//; + } # ARGUMENT: + + if ($#args>=0) { + my $insn; + my $sz=$opcode->size(); + + if ($gas) { + $insn = $opcode->out($#args>=1?$args[$#args]->size():$sz); + @args = map($_->out($sz),@args); + printf "\t%s\t%s",$insn,join(",",@args); + } else { + $insn = $opcode->out(); + foreach (@args) { + my $arg = $_->out(); + # $insn.=$sz compensates for movq, pinsrw, ... + if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; } + if ($arg =~ /^ymm[0-9]+$/) { $insn.=$sz; $sz="y" if(!$sz); last; } + if ($arg =~ /^zmm[0-9]+$/) { $insn.=$sz; $sz="z" if(!$sz); last; } + if ($arg =~ /^mm[0-9]+$/) { $insn.=$sz; $sz="q" if(!$sz); last; } + } + @args = reverse(@args); + undef $sz if ($nasm && $opcode->mnemonic() eq "lea"); + printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args)); + } + } else { + printf "\t%s",$opcode->out(); + } + } + + print $line,"\n"; +} + +while(<>) { process($_); } + +map { process($_) } @pdata_seg if ($win64); +map { process($_) } @xdata_seg if ($win64); + +# platform-specific epilogue +if ($masm) { + print "\n$current_segment\tENDS\n" if ($current_segment); + print "END\n"; +} elsif ($elf) { + # -fcf-protection segment, snatched from compiler -S output + my $align = ($flavour =~ /elf32/) ? 4 : 8; + print <<___; + +.section .note.GNU-stack,"",\@progbits +.section .note.gnu.property,"a",\@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align $align +2: +___ +} + +close STDOUT; + + ################################################# +# Cross-reference x86_64 ABI "card" +# +# Unix Win64 +# %rax * * +# %rbx - - +# %rcx #4 #1 +# %rdx #3 #2 +# %rsi #2 - +# %rdi #1 - +# %rbp - - +# %rsp - - +# %r8 #5 #3 +# %r9 #6 #4 +# %r10 * * +# %r11 * * +# %r12 - - +# %r13 - - +# %r14 - - +# %r15 - - +# +# (*) volatile register +# (-) preserved by callee +# (#) Nth argument, volatile +# +# In Unix terms top of stack is argument transfer area for arguments +# which could not be accommodated in registers. Or in other words 7th +# [integer] argument resides at 8(%rsp) upon function entry point. +# 128 bytes above %rsp constitute a "red zone" which is not touched +# by signal handlers and can be used as temporal storage without +# allocating a frame. +# +# In Win64 terms N*8 bytes on top of stack is argument transfer area, +# which belongs to/can be overwritten by callee. N is the number of +# arguments passed to callee, *but* not less than 4! This means that +# upon function entry point 5th argument resides at 40(%rsp), as well +# as that 32 bytes from 8(%rsp) can always be used as temporal +# storage [without allocating a frame]. One can actually argue that +# one can assume a "red zone" above stack pointer under Win64 as well. +# Point is that at apparently no occasion Windows kernel would alter +# the area above user stack pointer in true asynchronous manner... 
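+# For instance, with a hypothetical call foo(a,b,c,d,e,f): Unix passes a..f in
+# %rdi,%rsi,%rdx,%rcx,%r8,%r9 and would place a 7th argument at 8(%rsp), while
+# Win64 passes a..d in %rcx,%rdx,%r8,%r9, leaves the 32-byte scratch area at
+# 8(%rsp)-39(%rsp) to the callee, and spills e and f to 40(%rsp) and 48(%rsp).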
+# +# All the above means that if assembler programmer adheres to Unix +# register and stack layout, but disregards the "red zone" existence, +# it's possible to use following prologue and epilogue to "gear" from +# Unix to Win64 ABI in leaf functions with not more than 6 arguments. +# +# omnipotent_function: +# ifdef WIN64 +# movq %rdi,8(%rsp) +# movq %rsi,16(%rsp) +# movq %rcx,%rdi ; if 1st argument is actually present +# movq %rdx,%rsi ; if 2nd argument is actually ... +# movq %r8,%rdx ; if 3rd argument is ... +# movq %r9,%rcx ; if 4th argument ... +# movq 40(%rsp),%r8 ; if 5th ... +# movq 48(%rsp),%r9 ; if 6th ... +# endif +# ... +# ifdef WIN64 +# movq 8(%rsp),%rdi +# movq 16(%rsp),%rsi +# endif +# ret +# + ################################################# +# Win64 SEH, Structured Exception Handling. +# +# Unlike on Unix systems(*) lack of Win64 stack unwinding information +# has undesired side-effect at run-time: if an exception is raised in +# assembler subroutine such as those in question (basically we're +# referring to segmentation violations caused by malformed input +# parameters), the application is briskly terminated without invoking +# any exception handlers, most notably without generating memory dump +# or any user notification whatsoever. This poses a problem. It's +# possible to address it by registering custom language-specific +# handler that would restore processor context to the state at +# subroutine entry point and return "exception is not handled, keep +# unwinding" code. Writing such handler can be a challenge... But it's +# doable, though requires certain coding convention. Consider following +# snippet: +# +# .type function,@function +# function: +# movq %rsp,%rax # copy rsp to volatile register +# pushq %r15 # save non-volatile registers +# pushq %rbx +# pushq %rbp +# movq %rsp,%r11 +# subq %rdi,%r11 # prepare [variable] stack frame +# andq $-64,%r11 +# movq %rax,0(%r11) # check for exceptions +# movq %r11,%rsp # allocate [variable] stack frame +# movq %rax,0(%rsp) # save original rsp value +# magic_point: +# ... +# movq 0(%rsp),%rcx # pull original rsp value +# movq -24(%rcx),%rbp # restore non-volatile registers +# movq -16(%rcx),%rbx +# movq -8(%rcx),%r15 +# movq %rcx,%rsp # restore original rsp +# magic_epilogue: +# ret +# .size function,.-function +# +# The key is that up to magic_point copy of original rsp value remains +# in chosen volatile register and no non-volatile register, except for +# rsp, is modified. While past magic_point rsp remains constant till +# the very end of the function. In this case custom language-specific +# exception handler would look like this: +# +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +# { ULONG64 *rsp = (ULONG64 *)context->Rax; +# ULONG64 rip = context->Rip; +# +# if (rip >= magic_point) +# { rsp = (ULONG64 *)context->Rsp; +# if (rip < magic_epilogue) +# { rsp = (ULONG64 *)rsp[0]; +# context->Rbp = rsp[-3]; +# context->Rbx = rsp[-2]; +# context->R15 = rsp[-1]; +# } +# } +# context->Rsp = (ULONG64)rsp; +# context->Rdi = rsp[1]; +# context->Rsi = rsp[2]; +# +# memcpy (disp->ContextRecord,context,sizeof(CONTEXT)); +# RtlVirtualUnwind(UNW_FLAG_NHANDLER,disp->ImageBase, +# dips->ControlPc,disp->FunctionEntry,disp->ContextRecord, +# &disp->HandlerData,&disp->EstablisherFrame,NULL); +# return ExceptionContinueSearch; +# } +# +# It's appropriate to implement this handler in assembler, directly in +# function's module. 
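The offsets the handler above pulls from the recovered stack pointer follow directly from the prologue convention: %r15, %rbx and %rbp are pushed, in that order, right after the original %rsp is copied to %rax, and the Win64 "gear" prologue has already stashed %rdi and %rsi in the home slots at 8(%rsp) and 16(%rsp). A minimal sketch of that layout, expressed as a C view relative to the entry-time stack pointer (the struct name and this rendering are illustrative, not part of blst):

    #include <stdint.h>

    /* Layout relative to the original rsp recovered via rsp = rsp[0]. */
    typedef struct {
        uint64_t saved_rbp;   /* rsp[-3] : pushq %rbp                        */
        uint64_t saved_rbx;   /* rsp[-2] : pushq %rbx                        */
        uint64_t saved_r15;   /* rsp[-1] : pushq %r15                        */
        uint64_t return_addr; /* rsp[ 0] : caller's return address           */
        uint64_t home_rdi;    /* rsp[ 1] : %rdi saved by the "gear" prologue */
        uint64_t home_rsi;    /* rsp[ 2] : %rsi saved by the "gear" prologue */
    } entry_stack_view;

This is why the handler restores Rbp/Rbx/R15 from rsp[-3..-1] and Rdi/Rsi from rsp[1..2] once it has dereferenced the saved stack-pointer copy.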
In order to do that one has to know members' +# offsets in CONTEXT and DISPATCHER_CONTEXT structures and some constant +# values. Here they are: +# +# CONTEXT.Rax 120 +# CONTEXT.Rcx 128 +# CONTEXT.Rdx 136 +# CONTEXT.Rbx 144 +# CONTEXT.Rsp 152 +# CONTEXT.Rbp 160 +# CONTEXT.Rsi 168 +# CONTEXT.Rdi 176 +# CONTEXT.R8 184 +# CONTEXT.R9 192 +# CONTEXT.R10 200 +# CONTEXT.R11 208 +# CONTEXT.R12 216 +# CONTEXT.R13 224 +# CONTEXT.R14 232 +# CONTEXT.R15 240 +# CONTEXT.Rip 248 +# CONTEXT.Xmm6 512 +# sizeof(CONTEXT) 1232 +# DISPATCHER_CONTEXT.ControlPc 0 +# DISPATCHER_CONTEXT.ImageBase 8 +# DISPATCHER_CONTEXT.FunctionEntry 16 +# DISPATCHER_CONTEXT.EstablisherFrame 24 +# DISPATCHER_CONTEXT.TargetIp 32 +# DISPATCHER_CONTEXT.ContextRecord 40 +# DISPATCHER_CONTEXT.LanguageHandler 48 +# DISPATCHER_CONTEXT.HandlerData 56 +# UNW_FLAG_NHANDLER 0 +# ExceptionContinueSearch 1 +# +# In order to tie the handler to the function one has to compose +# couple of structures: one for .xdata segment and one for .pdata. +# +# UNWIND_INFO structure for .xdata segment would be +# +# function_unwind_info: +# .byte 9,0,0,0 +# .rva handler +# +# This structure designates exception handler for a function with +# zero-length prologue, no stack frame or frame register. +# +# To facilitate composing of .pdata structures, auto-generated "gear" +# prologue copies rsp value to rax and denotes next instruction with +# .LSEH_begin_{function_name} label. This essentially defines the SEH +# styling rule mentioned in the beginning. Position of this label is +# chosen in such manner that possible exceptions raised in the "gear" +# prologue would be accounted to caller and unwound from latter's frame. +# End of function is marked with respective .LSEH_end_{function_name} +# label. To summarize, .pdata segment would contain +# +# .rva .LSEH_begin_function +# .rva .LSEH_end_function +# .rva function_unwind_info +# +# Reference to function_unwind_info from .xdata segment is the anchor. +# In case you wonder why references are 32-bit .rvas and not 64-bit +# .quads. References put into these two segments are required to be +# *relative* to the base address of the current binary module, a.k.a. +# image base. No Win64 module, be it .exe or .dll, can be larger than +# 2GB and thus such relative references can be and are accommodated in +# 32 bits. +# +# Having reviewed the example function code, one can argue that "movq +# %rsp,%rax" above is redundant. It is not! Keep in mind that on Unix +# rax would contain an undefined value. If this "offends" you, use +# another register and refrain from modifying rax till magic_point is +# reached, i.e. as if it was a non-volatile register. If more registers +# are required prior [variable] frame setup is completed, note that +# nobody says that you can have only one "magic point." You can +# "liberate" non-volatile registers by denoting last stack off-load +# instruction and reflecting it in finer grade unwind logic in handler. +# After all, isn't it why it's called *language-specific* handler... +# +# SE handlers are also involved in unwinding stack when executable is +# profiled or debugged. Profiling implies additional limitations that +# are too subtle to discuss here. For now it's sufficient to say that +# in order to simplify handlers one should either a) offload original +# %rsp to stack (like discussed above); or b) if you have a register to +# spare for frame pointer, choose volatile one. +# +# (*) Note that we're talking about run-time, not debug-time. 
Lack of +# unwind information makes debugging hard on both Windows and +# Unix. "Unlike" refers to the fact that on Unix signal handler +# will always be invoked, core dumped and appropriate exit code +# returned to parent (for user notification). +# +######################################################################## +# As of May 2020 an alternative approach that works with both exceptions +# and debugging/profiling was implemented by re-purposing DWARF .cfi +# annotations even for Win64 unwind tables' generation. Unfortunately, +# but not really unexpectedly, it imposes additional limitations on +# coding style. Probably most significant limitation is that frame +# pointer has to be at 16*n distance from stack pointer at the exit +# from prologue. But first things first. There are two additional +# synthetic .cfi directives, .cfi_end_prologue and .cfi_epilogue, +# that need to be added to all functions marked with additional .type +# tag (see example below). There are "do's and don'ts" for prologue +# and epilogue. It shouldn't come as surprise that in prologue one may +# not modify non-volatile registers, but one may not modify %r11 either. +# This is because it's used as temporary frame pointer(*). There is one +# exception to this rule, and it's setting up frame pointer that is +# non-volatile or %r11. But it must be last instruction in the prologue. +# Constraints for epilogue, or rather on its boundary, depend on whether +# the frame is fixed- or variable-length. In fixed-frame subroutine +# stack pointer has to be restored in the last instruction prior the +# .cfi_epilogue directive. If it's variable-frame subroutine, and a +# non-volatile register was used as frame pointer, then last instruction +# prior the directive has to restore its original value. This means that +# final stack pointer adjustment would have to be pushed past the +# directive. Normally this would render the epilogue non-unwindable, so +# special care has to be taken. To resolve the dilemma, copy frame +# pointer to a volatile register in advance. To give an example: +# +# .type rbp_as_frame_pointer,\@function,3,"unwind" # mind extra tag! +# rbp_as_frame_pointer: +# .cfi_startproc +# push %rbp +# .cfi_push %rbp +# push %rbx +# .cfi_push %rbx +# mov %rsp,%rbp # last instruction in prologue +# .cfi_def_cfa_register %rbp # %rsp-%rbp has to be 16*n, e.g. 16*0 +# .cfi_end_prologue +# sub \$40,%rsp +# and \$-64,%rsp +# ... +# mov %rbp,%r11 +# .cfi_def_cfa_register %r11 # copy frame pointer to volatile %r11 +# mov 0(%rbp),%rbx +# mov 8(%rbp),%rbp # last instruction prior epilogue +# .cfi_epilogue # may not change %r11 in epilogue +# lea 16(%r11),%rsp +# ret +# .cfi_endproc +# .size rbp_as_frame_pointer,.-rbp_as_frame_pointer +# +# To give an example of fixed-frame subroutine for reference: +# +# .type fixed_frame,\@function,3,"unwind" # mind extra tag! +# fixed_frame: +# .cfi_startproc +# push %rbp +# .cfi_push %rbp +# push %rbx +# .cfi_push %rbx +# sub \$40,%rsp +# .cfi_adjust_cfa_offset 40 +# .cfi_end_prologue +# ... +# mov 40(%rsp),%rbx +# mov 48(%rsp),%rbp +# lea 56(%rsp),%rsp +# .cfi_adjust_cfa_offset -56 +# .cfi_epilogue +# ret +# .cfi_endproc +# .size fixed_frame,.-fixed_frame +# +# As for epilogue itself, one can only work on non-volatile registers. +# "Non-volatile" in "Windows" sense, i.e. minus %rdi and %rsi. +# +# On a final note, mixing old-style and modernized subroutines in the +# same file takes some trickery. Ones of the new kind have to appear +# after old-style ones. 
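The numbers in the fixed_frame example are worth checking once, since the closing lea/.cfi_adjust_cfa_offset pair must undo every prologue adjustment exactly: two 8-byte pushes plus the 40-byte sub account for the 56 bytes released by lea 56(%rsp),%rsp, and the two pushed registers sit just above the 40-byte scratch area. A small compile-time check of that arithmetic (illustrative only, not part of blst):

    /* Frame accounting implied by the fixed_frame example above. */
    enum { PUSH_RBP = 8, PUSH_RBX = 8, SUB_RSP = 40 };

    _Static_assert(SUB_RSP + PUSH_RBX + PUSH_RBP == 56,
                   "undone by lea 56(%rsp),%rsp / .cfi_adjust_cfa_offset -56");
    _Static_assert(SUB_RSP == 40,
                   "%rbx is restored from 40(%rsp), just above the scratch area");
    _Static_assert(SUB_RSP + PUSH_RBX == 48,
                   "%rbp is restored from 48(%rsp)");

As noted above, subroutines written in this modernized style have to come after the old-style ones in the same file.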
This has everything to do with the fact that +# entries in the .pdata segment have to appear in strictly same order +# as corresponding subroutines, and auto-generated RUNTIME_FUNCTION +# structures get mechanically appended to whatever existing .pdata. +# +# (*) Just in case, why %r11 and not %rax. This has everything to do +# with the way UNWIND_INFO is, one just can't designate %rax as +# frame pointer. diff --git a/crypto/blst_src/blst_t.hpp b/crypto/blst_src/blst_t.hpp new file mode 100644 index 00000000000..1b150da30ce --- /dev/null +++ b/crypto/blst_src/blst_t.hpp @@ -0,0 +1,538 @@ +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef __BLST_T_HPP__ +#define __BLST_T_HPP__ + +/* + * These templates, blst_384_t and blst_256_t, allow to instantiate slim + * C++ shims to blst assembly with arbitrary moduli. Well, not literally + * arbitrary, as there are limitations. Most notably blst_384_t can not + * actually accommodate 384-bit moduli, only 383 and narrower. This is + * because of ct_inverse_mod_383's limitation. Though if you abstain + * from the reciprocal() method, even 384-bit modulus would work. As for + * blst_256_t, modulus has to be not larger than 2^256-2^192-1. + */ + +#ifdef __GNUC__ +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-function" +#endif + +extern "C" { +#include "vect.h" +} +#include "bytes.h" + +#undef launder // avoid conflict with C++ >=17 + +#ifdef __GNUC__ +# pragma GCC diagnostic pop +#endif + +static inline void vec_left_align(limb_t *out, const limb_t *inp, size_t n) +{ + const unsigned int nbits = sizeof(inp[0])*8; + unsigned int align = 0; + limb_t top = inp[n-1]; + + if (top) { + while ((top >> (nbits-1)) == 0) + top <<= 1, align++; + } + if (align) { + while (--n) { + limb_t next = inp[n-1]; + out[n] = top | next >> (nbits-align); + top = next << align; + } + out[0] = top; + } else { + for (size_t i = 0; i < n-1; i++) + out[i] = inp[i]; + out[n-1] = top; + } +} + +constexpr static inline size_t vec_nbits(const limb_t *inp, size_t n) +{ + const unsigned int nbits = sizeof(inp[0])*8; + size_t align = 0; + limb_t top = inp[n-1]; + + while ((top >> (nbits-1)) == 0) + top <<= 1, align++; + + return n*nbits - align; +} + +template +class blst_384_t { +private: + vec384 val; + + inline operator const limb_t*() const { return val; } + inline operator limb_t*() { return val; } + inline limb_t& operator[](size_t i) { return val[i]; } + inline const limb_t& operator[](size_t i) const { return val[i]; } + +public: + static const size_t n = sizeof(vec384)/sizeof(limb_t); + static const size_t nbits = vec_nbits(MOD, n); + typedef byte pow_t[384/8]; + + inline blst_384_t() {} + inline blst_384_t(const vec384 p, bool align = false) + { + if (align) + vec_left_align(val, p, n); + else + vec_copy(val, p, sizeof(val)); + } + inline blst_384_t(uint64_t a) + { + vec_zero(val, sizeof(val)); + val[0] = a; + if (a) to(); + } + inline blst_384_t(int a) : blst_384_t((uint64_t)a) {} + + inline void to_scalar(pow_t& scalar) const + { + const union { + long one; + char little; + } is_endian = { 1 }; + + if ((size_t)scalar%sizeof(limb_t) == 0 && is_endian.little) { + from_mont_384((limb_t *)scalar, val, MOD, M0); + } else { + vec384 out; + from_mont_384(out, val, MOD, M0); + le_bytes_from_limbs(scalar, out, sizeof(pow_t)); + vec_zero(out, sizeof(out)); + } + } + + static inline const blst_384_t& one() + { return *reinterpret_cast(ONE); } + + inline 
blst_384_t& to() + { mul_mont_384(val, RR, val, MOD, M0); return *this; } + inline blst_384_t& from() + { from_mont_384(val, val, MOD, M0); return *this; } + + inline void store(limb_t *p) const + { vec_copy(p, val, sizeof(val)); } + + inline blst_384_t& operator+=(const blst_384_t& b) + { add_mod_384(val, val, b, MOD); return *this; } + friend inline blst_384_t operator+(const blst_384_t& a, const blst_384_t& b) + { + blst_384_t ret; + add_mod_384(ret, a, b, MOD); + return ret; + } + + inline blst_384_t& operator<<=(unsigned l) + { lshift_mod_384(val, val, l, MOD); return *this; } + friend inline blst_384_t operator<<(const blst_384_t& a, unsigned l) + { + blst_384_t ret; + lshift_mod_384(ret, a, l, MOD); + return ret; + } + + inline blst_384_t& operator>>=(unsigned r) + { rshift_mod_384(val, val, r, MOD); return *this; } + friend inline blst_384_t operator>>(blst_384_t a, unsigned r) + { + blst_384_t ret; + rshift_mod_384(ret, a, r, MOD); + return ret; + } + + inline blst_384_t& operator-=(const blst_384_t& b) + { sub_mod_384(val, val, b, MOD); return *this; } + friend inline blst_384_t operator-(const blst_384_t& a, const blst_384_t& b) + { + blst_384_t ret; + sub_mod_384(ret, a, b, MOD); + return ret; + } + + inline blst_384_t& cneg(bool flag) + { cneg_mod_384(val, val, flag, MOD); return *this; } + friend inline blst_384_t cneg(const blst_384_t& a, bool flag) + { + blst_384_t ret; + cneg_mod_384(ret, a, flag, MOD); + return ret; + } + friend inline blst_384_t operator-(const blst_384_t& a) + { + blst_384_t ret; + cneg_mod_384(ret, a, true, MOD); + return ret; + } + + inline blst_384_t& operator*=(const blst_384_t& a) + { + if (this == &a) sqr_mont_384(val, val, MOD, M0); + else mul_mont_384(val, val, a, MOD, M0); + return *this; + } + friend inline blst_384_t operator*(const blst_384_t& a, const blst_384_t& b) + { + blst_384_t ret; + if (&a == &b) sqr_mont_384(ret, a, MOD, M0); + else mul_mont_384(ret, a, b, MOD, M0); + return ret; + } + + // simplified exponentiation, but mind the ^ operator's precedence! 
+ friend inline blst_384_t operator^(const blst_384_t& a, unsigned p) + { + if (p < 2) { + abort(); + } else if (p == 2) { + blst_384_t ret; + sqr_mont_384(ret, a, MOD, M0); + return ret; + } else { + blst_384_t ret; + sqr_mont_384(ret, a, MOD, M0); + for (p -= 2; p--;) + mul_mont_384(ret, ret, a, MOD, M0); + return ret; + } + } + inline blst_384_t& operator^=(unsigned p) + { + if (p < 2) { + abort(); + } else if (p == 2) { + sqr_mont_384(val, val, MOD, M0); + return *this; + } + return *this = *this^p; + } + inline blst_384_t operator()(unsigned p) + { return *this^p; } + friend inline blst_384_t sqr(const blst_384_t& a) + { return a^2; } + + inline bool is_zero() const + { return vec_is_zero(val, sizeof(val)); } + + inline void zero() + { vec_zero(val, sizeof(val)); } + + blst_384_t reciprocal() const + { + static const blst_384_t MODx{MOD, true}; + static const blst_384_t RRx4 = *reinterpret_cast(RR)<<2; + union { vec768 x; vec384 r[2]; } temp; + + ct_inverse_mod_383(temp.x, val, MOD, MODx); + redc_mont_384(temp.r[0], temp.x, MOD, M0); + mul_mont_384(temp.r[0], temp.r[0], RRx4, MOD, M0); + + return *reinterpret_cast(temp.r[0]); + } + friend inline blst_384_t operator/(unsigned one, const blst_384_t& a) + { + if (one == 1) + return a.reciprocal(); + abort(); + } + friend inline blst_384_t operator/(const blst_384_t& a, const blst_384_t& b) + { return a * b.reciprocal(); } + inline blst_384_t& operator/=(const blst_384_t& a) + { return *this *= a.reciprocal(); } + +#ifndef NDEBUG + inline blst_384_t(const char *hexascii) + { limbs_from_hexascii(val, sizeof(val), hexascii); to(); } + + friend inline bool operator==(const blst_384_t& a, const blst_384_t& b) + { return vec_is_equal(a, b, sizeof(vec384)); } + friend inline bool operator!=(const blst_384_t& a, const blst_384_t& b) + { return !vec_is_equal(a, b, sizeof(vec384)); } + +# if defined(_GLIBCXX_IOSTREAM) || defined(_IOSTREAM_) // non-standard + friend std::ostream& operator<<(std::ostream& os, const blst_384_t& obj) + { + unsigned char be[sizeof(obj)]; + char buf[2+2*sizeof(obj)+1], *str = buf; + + be_bytes_from_limbs(be, blst_384_t{obj}.from(), sizeof(obj)); + + *str++ = '0', *str++ = 'x'; + for (size_t i = 0; i < sizeof(obj); i++) + *str++ = hex_from_nibble(be[i]>>4), *str++ = hex_from_nibble(be[i]); + *str = '\0'; + + return os << buf; + } +# endif +#endif +}; + +template +class blst_256_t { + vec256 val; + + inline operator const limb_t*() const { return val; } + inline operator limb_t*() { return val; } + inline limb_t& operator[](size_t i) { return val[i]; } + inline const limb_t& operator[](size_t i) const { return val[i]; } + +public: + static const size_t n = sizeof(vec256)/sizeof(limb_t); + static const size_t nbits = vec_nbits(MOD, n); + typedef byte pow_t[256/8]; + + inline blst_256_t() {} + inline blst_256_t(const vec256 p, bool align = false) + { + if (align) + vec_left_align(val, p, n); + else + vec_copy(val, p, sizeof(val)); + } + inline blst_256_t(uint64_t a) + { + vec_zero(val, sizeof(val)); + val[0] = a; + if (a) to(); + } + inline blst_256_t(int a) : blst_256_t((uint64_t)a) {} + + inline void to_scalar(pow_t& scalar) const + { + const union { + long one; + char little; + } is_endian = { 1 }; + + if ((size_t)scalar%sizeof(limb_t) == 0 && is_endian.little) { + from_mont_256((limb_t *)scalar, val, MOD, M0); + } else { + vec256 out; + from_mont_256(out, val, MOD, M0); + le_bytes_from_limbs(scalar, out, sizeof(pow_t)); + vec_zero(out, sizeof(out)); + } + } + + static inline const blst_256_t& one() + { return 
*reinterpret_cast(ONE); } + + inline blst_256_t& to() + { mul_mont_sparse_256(val, val, RR, MOD, M0); return *this; } + inline blst_256_t& to(const uint64_t a[2*n]) + { + mul_mont_sparse_256(val, RR, (const limb_t*)(a + n), MOD, M0); + vec256 lo{0}; + add_mod_256(lo, lo, (const limb_t*)a, MOD); + add_mod_256(val, val, lo, MOD); + mul_mont_sparse_256(val, RR, val, MOD, M0); + + return *this; + } + blst_256_t& to(const unsigned char* bytes, size_t n, bool le = false) + { + vec_zero(val, sizeof(val)); + + vec256 digit, zero{0}; + size_t rem = (n - 1) % 32 + 1; + n -= rem; + + if (le) { + limbs_from_le_bytes(val, bytes += n, rem); + mul_mont_sparse_256(val, RR, val, MOD, M0); + while (n) { + limbs_from_le_bytes(digit, bytes -= 32, 32); + add_mod_256(digit, digit, zero, MOD); + add_mod_256(val, val, digit, MOD); + mul_mont_sparse_256(val, RR, val, MOD, M0); + n -= 32; + } + } else { + limbs_from_be_bytes(val, bytes, rem); + mul_mont_sparse_256(val, RR, val, MOD, M0); + bytes += rem; + while (n) { + limbs_from_be_bytes(digit, bytes, 32); + add_mod_256(digit, digit, zero, MOD); + add_mod_256(val, val, digit, MOD); + mul_mont_sparse_256(val, RR, val, MOD, M0); + bytes += 32; + n -= 32; + } + } + + return *this; + } + + inline blst_256_t& from() + { from_mont_256(val, val, MOD, M0); return *this; } + + inline void store(limb_t *p) const + { vec_copy(p, val, sizeof(val)); } + + inline blst_256_t& operator+=(const blst_256_t& b) + { add_mod_256(val, val, b, MOD); return *this; } + friend inline blst_256_t operator+(const blst_256_t& a, const blst_256_t& b) + { + blst_256_t ret; + add_mod_256(ret, a, b, MOD); + return ret; + } + + inline blst_256_t& operator<<=(unsigned l) + { lshift_mod_256(val, val, l, MOD); return *this; } + friend inline blst_256_t operator<<(const blst_256_t& a, unsigned l) + { + blst_256_t ret; + lshift_mod_256(ret, a, l, MOD); + return ret; + } + + inline blst_256_t& operator>>=(unsigned r) + { lshift_mod_256(val, val, r, MOD); return *this; } + friend inline blst_256_t operator>>(blst_256_t a, unsigned r) + { + blst_256_t ret; + lshift_mod_256(ret, a, r, MOD); + return ret; + } + + inline blst_256_t& operator-=(const blst_256_t& b) + { sub_mod_256(val, val, b, MOD); return *this; } + friend inline blst_256_t operator-(const blst_256_t& a, const blst_256_t& b) + { + blst_256_t ret; + sub_mod_256(ret, a, b, MOD); + return ret; + } + + inline blst_256_t& cneg(bool flag) + { cneg_mod_256(val, val, flag, MOD); return *this; } + friend inline blst_256_t cneg(const blst_256_t& a, bool flag) + { + blst_256_t ret; + cneg_mod_256(ret, a, flag, MOD); + return ret; + } + friend inline blst_256_t operator-(const blst_256_t& a) + { + blst_256_t ret; + cneg_mod_256(ret, a, true, MOD); + return ret; + } + + inline blst_256_t& operator*=(const blst_256_t& a) + { + if (this == &a) sqr_mont_sparse_256(val, val, MOD, M0); + else mul_mont_sparse_256(val, val, a, MOD, M0); + return *this; + } + friend inline blst_256_t operator*(const blst_256_t& a, const blst_256_t& b) + { + blst_256_t ret; + if (&a == &b) sqr_mont_sparse_256(ret, a, MOD, M0); + else mul_mont_sparse_256(ret, a, b, MOD, M0); + return ret; + } + + // simplified exponentiation, but mind the ^ operator's precedence! 
+ friend inline blst_256_t operator^(const blst_256_t& a, unsigned p) + { + if (p < 2) { + abort(); + } else if (p == 2) { + blst_256_t ret; + sqr_mont_sparse_256(ret, a, MOD, M0); + return ret; + } else { + blst_256_t ret; + sqr_mont_sparse_256(ret, a, MOD, M0); + for (p -= 2; p--;) + mul_mont_sparse_256(ret, ret, a, MOD, M0); + return ret; + } + } + inline blst_256_t& operator^=(unsigned p) + { + if (p < 2) { + abort(); + } else if (p == 2) { + sqr_mont_sparse_256(val, val, MOD, M0); + return *this; + } + return *this = *this^p; + } + inline blst_256_t operator()(unsigned p) + { return *this^p; } + friend inline blst_256_t sqr(const blst_256_t& a) + { return a^2; } + + inline bool is_zero() const + { return vec_is_zero(val, sizeof(val)); } + + inline void zero() + { vec_zero(val, sizeof(val)); } + + blst_256_t reciprocal() const + { + static const blst_256_t MODx{MOD, true}; + union { vec512 x; vec256 r[2]; } temp; + + ct_inverse_mod_256(temp.x, val, MOD, MODx); + redc_mont_256(temp.r[0], temp.x, MOD, M0); + mul_mont_sparse_256(temp.r[0], temp.r[0], RR, MOD, M0); + + return *reinterpret_cast(temp.r[0]); + } + friend inline blst_256_t operator/(int one, const blst_256_t& a) + { + if (one == 1) + return a.reciprocal(); + abort(); + } + friend inline blst_256_t operator/(const blst_256_t& a, const blst_256_t& b) + { return a * b.reciprocal(); } + inline blst_256_t& operator/=(const blst_256_t& a) + { return *this *= a.reciprocal(); } + +#ifndef NDEBUG + inline blst_256_t(const char *hexascii) + { limbs_from_hexascii(val, sizeof(val), hexascii); to(); } + + friend inline bool operator==(const blst_256_t& a, const blst_256_t& b) + { return vec_is_equal(a, b, sizeof(vec256)); } + friend inline bool operator!=(const blst_256_t& a, const blst_256_t& b) + { return !vec_is_equal(a, b, sizeof(vec256)); } + +# if defined(_GLIBCXX_IOSTREAM) || defined(_IOSTREAM_) // non-standard + friend std::ostream& operator<<(std::ostream& os, const blst_256_t& obj) + { + unsigned char be[sizeof(obj)]; + char buf[2+2*sizeof(obj)+1], *str=buf; + + be_bytes_from_limbs(be, blst_256_t{obj}.from(), sizeof(obj)); + + *str++ = '0', *str++ = 'x'; + for (size_t i = 0; i < sizeof(obj); i++) + *str++ = hex_from_nibble(be[i]>>4), *str++ = hex_from_nibble(be[i]); + *str = '\0'; + + return os << buf; + } +# endif +#endif +}; +#endif diff --git a/crypto/blst_src/build/assembly.S b/crypto/blst_src/build/assembly.S new file mode 100644 index 00000000000..a1a7c5416e0 --- /dev/null +++ b/crypto/blst_src/build/assembly.S @@ -0,0 +1,123 @@ +#if defined(__x86_64) || defined(__x86_64__) +# if defined(__ELF__) +# if defined(__BLST_PORTABLE__) +# include "elf/sha256-portable-x86_64.s" +# else +# include "elf/sha256-x86_64.s" +# endif +# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "elf/ctx_inverse_mod_384-x86_64.s" +# else +# include "elf/ctq_inverse_mod_384-x86_64.s" +# endif +# include "elf/add_mod_384-x86_64.s" +# include "elf/add_mod_384x384-x86_64.s" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# define __sub_mod_384x384 __sub_mont_384x384 +# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "elf/mulx_mont_384-x86_64.s" +# include "elf/mulx_mont_256-x86_64.s" +# else +# include "elf/mulq_mont_384-x86_64.s" +# include "elf/mulq_mont_256-x86_64.s" +# endif +# include "elf/add_mod_256-x86_64.s" +# include "elf/ct_inverse_mod_256-x86_64.s" +# include "elf/div3w-x86_64.s" +# include "elf/ct_is_square_mod_384-x86_64.s" +# elif defined(_WIN64) || defined(__CYGWIN__) +# if 
defined(__BLST_PORTABLE__) +# include "coff/sha256-portable-x86_64.s" +# else +# include "coff/sha256-x86_64.s" +# endif +# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "coff/ctx_inverse_mod_384-x86_64.s" +# else +# include "coff/ctq_inverse_mod_384-x86_64.s" +# endif +# include "coff/add_mod_384-x86_64.s" +# include "coff/add_mod_384x384-x86_64.s" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# define __sub_mod_384x384 __sub_mont_384x384 +# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "coff/mulx_mont_384-x86_64.s" +# include "coff/mulx_mont_256-x86_64.s" +# else +# include "coff/mulq_mont_384-x86_64.s" +# include "coff/mulq_mont_256-x86_64.s" +# endif +# include "coff/add_mod_256-x86_64.s" +# include "coff/ct_inverse_mod_256-x86_64.s" +# include "coff/div3w-x86_64.s" +# include "coff/ct_is_square_mod_384-x86_64.s" +# elif defined(__APPLE__) +# include "mach-o/sha256-x86_64.s" +# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "mach-o/ctx_inverse_mod_384-x86_64.s" +# else +# include "mach-o/ctq_inverse_mod_384-x86_64.s" +# endif +# include "mach-o/add_mod_384-x86_64.s" +# include "mach-o/add_mod_384x384-x86_64.s" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# define __sub_mod_384x384 __sub_mont_384x384 +# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "mach-o/mulx_mont_384-x86_64.s" +# include "mach-o/mulx_mont_256-x86_64.s" +# else +# include "mach-o/mulq_mont_384-x86_64.s" +# include "mach-o/mulq_mont_256-x86_64.s" +# endif +# include "mach-o/add_mod_256-x86_64.s" +# include "mach-o/ct_inverse_mod_256-x86_64.s" +# include "mach-o/div3w-x86_64.s" +# include "mach-o/ct_is_square_mod_384-x86_64.s" +# endif +#elif defined(__aarch64__) +# if defined(__ELF__) +# include "elf/sha256-armv8.S" +# include "elf/ct_inverse_mod_384-armv8.S" +# include "elf/add_mod_384-armv8.S" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# include "elf/mul_mont_384-armv8.S" +# include "elf/mul_mont_256-armv8.S" +# include "elf/add_mod_256-armv8.S" +# include "elf/ct_inverse_mod_256-armv8.S" +# include "elf/div3w-armv8.S" +# include "elf/ct_is_square_mod_384-armv8.S" +# elif defined(_WIN64) +# include "coff/sha256-armv8.S" +# include "coff/ct_inverse_mod_384-armv8.S" +# include "coff/add_mod_384-armv8.S" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# include "coff/mul_mont_384-armv8.S" +# include "coff/mul_mont_256-armv8.S" +# include "coff/add_mod_256-armv8.S" +# include "coff/ct_inverse_mod_256-armv8.S" +# include "coff/div3w-armv8.S" +# include "coff/ct_is_square_mod_384-armv8.S" +# elif defined(__APPLE__) +# include "mach-o/sha256-armv8.S" +# include "mach-o/ct_inverse_mod_384-armv8.S" +# include "mach-o/add_mod_384-armv8.S" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# include "mach-o/mul_mont_384-armv8.S" +# include "mach-o/mul_mont_256-armv8.S" +# include "mach-o/add_mod_256-armv8.S" +# include "mach-o/ct_inverse_mod_256-armv8.S" +# include "mach-o/div3w-armv8.S" +# include "mach-o/ct_is_square_mod_384-armv8.S" +# endif +#elif defined(__BLST_NO_ASM__) || \ + (defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__==4) +/* inaccurate way to detect a 32-bit processor, but it's close enough */ +#else +# error "unsupported platform" +#endif diff --git a/crypto/blst_src/build/bindings_trim.pl b/crypto/blst_src/build/bindings_trim.pl new file mode 100755 index 00000000000..90f914578d9 --- 
/dev/null +++ b/crypto/blst_src/build/bindings_trim.pl @@ -0,0 +1,37 @@ +#!/usr/bin/env perl + +# read whole file +while(<>) { push @file, $_; } + +# traverse and remove auto-generated PartialEq for chosen types +for (my $i = 0; $i <= $#file; $i++) { + if (@file[$i] =~ m/struct\s+blst_p[12]/) { + @file[$i-1] =~ s/,\s*PartialEq//; + } elsif (@file[$i] =~ m/struct\s+blst_fp12/) { + @file[$i-1] =~ s/,\s*(?:Default|PartialEq)//g; + } elsif (@file[$i] =~ m/struct\s+(blst_pairing|blst_uniq)/) { + @file[$i-1] =~ s/,\s*(?:Copy|Clone|Eq|PartialEq)//g; + } elsif (@file[$i] =~ m/struct\s+blst_scalar/) { + @file[$i-1] =~ s/,\s*Copy//; + @file[$i-1] =~ s/\)/, Zeroize\)/; + splice @file, $i, 0, "#[zeroize(drop)]\n"; $i++; + } elsif (@file[$i] =~ m/assert_eq!\($/) { + @file[++$i] =~ s/unsafe\s*\{\s*&\(\*\(::std::ptr::null::<(\w+)>\(\)\)\)\.(\w+).*\}/offsetof!($1, $2)/; + } +} + +print << '___'; +#[cfg(test)] +macro_rules! offsetof { + ($type:ty, $field:tt) => { + { + let v = <$type>::default(); + (&v.$field as *const _ as usize) - (&v as *const _ as usize) + } + }; +} +___ +# print the file +print @file; + +close STDOUT; diff --git a/crypto/blst_src/build/coff/add_mod_256-armv8.S b/crypto/blst_src/build/coff/add_mod_256-armv8.S new file mode 100644 index 00000000000..27b64ef4ca4 --- /dev/null +++ b/crypto/blst_src/build/coff/add_mod_256-armv8.S @@ -0,0 +1,397 @@ +.text + +.globl add_mod_256 + +.def add_mod_256; +.type 32; +.endef +.p2align 5 +add_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + adds x8,x8,x12 + ldp x14,x15,[x2,#16] + adcs x9,x9,x13 + ldp x4,x5,[x3] + adcs x10,x10,x14 + ldp x6,x7,[x3,#16] + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret + + +.globl mul_by_3_mod_256 + +.def mul_by_3_mod_256; +.type 32; +.endef +.p2align 5 +mul_by_3_mod_256: + ldp x12,x13,[x1] + ldp x14,x15,[x1,#16] + + adds x8,x12,x12 + ldp x4,x5,[x2] + adcs x9,x13,x13 + ldp x6,x7,[x2,#16] + adcs x10,x14,x14 + adcs x11,x15,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + adds x8,x8,x12 + adcs x9,x9,x13 + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret + + +.globl lshift_mod_256 + +.def lshift_mod_256; +.type 32; +.endef +.p2align 5 +lshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +.Loop_lshift_mod_256: + adds x8,x8,x8 + sub x2,x2,#1 + adcs x9,x9,x9 + adcs x10,x10,x10 + adcs x11,x11,x11 + adc x3,xzr,xzr + + subs x12,x8,x4 + sbcs x13,x9,x5 + sbcs x14,x10,x6 + sbcs x15,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x12,lo + csel x9,x9,x13,lo + csel x10,x10,x14,lo + csel x11,x11,x15,lo + + cbnz x2,.Loop_lshift_mod_256 + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret + + +.globl rshift_mod_256 + +.def rshift_mod_256; +.type 32; +.endef +.p2align 5 +rshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +.Loop_rshift: + adds x12,x8,x4 + sub x2,x2,#1 + adcs x13,x9,x5 + adcs x14,x10,x6 + adcs x15,x11,x7 + adc x3,xzr,xzr + tst x8,#1 + + csel x12,x12,x8,ne 
+ csel x13,x13,x9,ne + csel x14,x14,x10,ne + csel x15,x15,x11,ne + csel x3,x3,xzr,ne + + extr x8,x13,x12,#1 + extr x9,x14,x13,#1 + extr x10,x15,x14,#1 + extr x11,x3,x15,#1 + + cbnz x2,.Loop_rshift + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret + + +.globl cneg_mod_256 + +.def cneg_mod_256; +.type 32; +.endef +.p2align 5 +cneg_mod_256: + ldp x8,x9,[x1] + ldp x4,x5,[x3] + + ldp x10,x11,[x1,#16] + subs x12,x4,x8 + ldp x6,x7,[x3,#16] + orr x4,x8,x9 + sbcs x13,x5,x9 + orr x5,x10,x11 + sbcs x14,x6,x10 + orr x3,x4,x5 + sbc x15,x7,x11 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x8,x8,x12,eq + csel x9,x9,x13,eq + csel x10,x10,x14,eq + stp x8,x9,[x0] + csel x11,x11,x15,eq + stp x10,x11,[x0,#16] + + ret + + +.globl sub_mod_256 + +.def sub_mod_256; +.type 32; +.endef +.p2align 5 +sub_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + subs x8,x8,x12 + ldp x14,x15,[x2,#16] + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + stp x8,x9,[x0] + adc x11,x11,x7 + stp x10,x11,[x0,#16] + + ret + + +.globl check_mod_256 + +.def check_mod_256; +.type 32; +.endef +.p2align 5 +check_mod_256: + ldp x8,x9,[x0] + ldp x10,x11,[x0,#16] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + subs xzr,x8,x4 + sbcs xzr,x9,x5 + orr x8,x8,x9 + sbcs xzr,x10,x6 + orr x8,x8,x10 + sbcs xzr,x11,x7 + orr x8,x8,x11 + sbc x1,xzr,xzr + + cmp x8,#0 + mov x0,#1 + csel x0,x0,xzr,ne + and x0,x0,x1 + + ret + + +.globl add_n_check_mod_256 + +.def add_n_check_mod_256; +.type 32; +.endef +.p2align 5 +add_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + adds x8,x8,x12 + ldp x4,x5,[x3] + adcs x9,x9,x13 + ldp x6,x7,[x3,#16] + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret + + +.globl sub_n_check_mod_256 + +.def sub_n_check_mod_256; +.type 32; +.endef +.p2align 5 +sub_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + subs x8,x8,x12 + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + adc x11,x11,x7 + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret + diff --git a/crypto/blst_src/build/coff/add_mod_256-x86_64.s b/crypto/blst_src/build/coff/add_mod_256-x86_64.s new file mode 100644 index 00000000000..f88e6189ca5 --- 
/dev/null +++ b/crypto/blst_src/build/coff/add_mod_256-x86_64.s @@ -0,0 +1,911 @@ +.text + +.globl add_mod_256 + +.def add_mod_256; .scl 2; .type 32; .endef +.p2align 5 +add_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_add_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + subq $8,%rsp + +.LSEH_body_add_mod_256: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loaded_a_add_mod_256: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_add_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_add_mod_256: + + +.globl mul_by_3_mod_256 + +.def mul_by_3_mod_256; .scl 2; .type 32; .endef +.p2align 5 +mul_by_3_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_3_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + +.LSEH_body_mul_by_3_mod_256: + + + movq %rdx,%rcx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rsi,%rdx + movq 24(%rsi),%r11 + + call __lshift_mod_256 + movq 0(%rsp),%r12 + + jmp .Loaded_a_add_mod_256 + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_mul_by_3_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_3_mod_256: + +.def __lshift_mod_256; .scl 3; .type 32; .endef +.p2align 5 +__lshift_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + movq %r8,%rax + adcq %r10,%r10 + movq %r9,%rsi + adcq %r11,%r11 + sbbq %r12,%r12 + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + cmovcq %rbx,%r10 + cmovcq %rbp,%r11 + + .byte 0xf3,0xc3 + + + +.globl lshift_mod_256 + +.def lshift_mod_256; .scl 2; .type 32; .endef +.p2align 5 +lshift_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_lshift_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + +.LSEH_body_lshift_mod_256: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loop_lshift_mod_256: + call __lshift_mod_256 + decl %edx + jnz .Loop_lshift_mod_256 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 0(%rsp),%r12 + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_lshift_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_lshift_mod_256: + + +.globl rshift_mod_256 + +.def rshift_mod_256; .scl 2; .type 32; .endef +.p2align 5 +rshift_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_rshift_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + subq $8,%rsp + +.LSEH_body_rshift_mod_256: + + + movq 
0(%rsi),%rbp + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loop_rshift_mod_256: + movq %rbp,%r8 + andq $1,%rbp + movq 0(%rcx),%rax + negq %rbp + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + + andq %rbp,%rax + andq %rbp,%rsi + andq %rbp,%rbx + andq 24(%rcx),%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + adcq %rbx,%r10 + adcq %rbp,%r11 + sbbq %rax,%rax + + shrq $1,%r8 + movq %r9,%rbp + shrq $1,%r9 + movq %r10,%rbx + shrq $1,%r10 + movq %r11,%rsi + shrq $1,%r11 + + shlq $63,%rbp + shlq $63,%rbx + orq %r8,%rbp + shlq $63,%rsi + orq %rbx,%r9 + shlq $63,%rax + orq %rsi,%r10 + orq %rax,%r11 + + decl %edx + jnz .Loop_rshift_mod_256 + + movq %rbp,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_rshift_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_rshift_mod_256: + + +.globl cneg_mod_256 + +.def cneg_mod_256; .scl 2; .type 32; .endef +.p2align 5 +cneg_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_cneg_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + +.LSEH_body_cneg_mod_256: + + + movq 0(%rsi),%r12 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %r12,%r8 + movq 24(%rsi),%r11 + orq %r9,%r12 + orq %r10,%r12 + orq %r11,%r12 + movq $-1,%rbp + + movq 0(%rcx),%rax + cmovnzq %rbp,%r12 + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + andq %r12,%rax + movq 24(%rcx),%rbp + andq %r12,%rsi + andq %r12,%rbx + andq %r12,%rbp + + subq %r8,%rax + sbbq %r9,%rsi + sbbq %r10,%rbx + sbbq %r11,%rbp + + orq %rdx,%rdx + + cmovzq %r8,%rax + cmovzq %r9,%rsi + movq %rax,0(%rdi) + cmovzq %r10,%rbx + movq %rsi,8(%rdi) + cmovzq %r11,%rbp + movq %rbx,16(%rdi) + movq %rbp,24(%rdi) + + movq 0(%rsp),%r12 + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_cneg_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_cneg_mod_256: + + +.globl sub_mod_256 + +.def sub_mod_256; .scl 2; .type 32; .endef +.p2align 5 +sub_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sub_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + subq $8,%rsp + +.LSEH_body_sub_mod_256: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_sub_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sub_mod_256: + + +.globl check_mod_256 + +.def check_mod_256; .scl 2; .type 32; .endef +.p2align 5 +check_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_check_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + + + movq 0(%rdi),%rax + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + + movq %rax,%r8 + orq %r9,%rax + orq %r10,%rax + orq %r11,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 
+ sbbq %rsi,%rsi + + movq $1,%rdx + cmpq $0,%rax + cmovneq %rdx,%rax + andq %rsi,%rax +.LSEH_epilogue_check_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_check_mod_256: + + +.globl add_n_check_mod_256 + +.def add_n_check_mod_256; .scl 2; .type 32; .endef +.p2align 5 +add_n_check_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_add_n_check_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + subq $8,%rsp + +.LSEH_body_add_n_check_mod_256: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_add_n_check_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_add_n_check_mod_256: + + +.globl sub_n_check_mod_256 + +.def sub_n_check_mod_256; .scl 2; .type 32; .endef +.p2align 5 +sub_n_check_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sub_n_check_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + subq $8,%rsp + +.LSEH_body_sub_n_check_mod_256: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_sub_n_check_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sub_n_check_mod_256: +.section .pdata +.p2align 2 +.rva .LSEH_begin_add_mod_256 +.rva .LSEH_body_add_mod_256 +.rva .LSEH_info_add_mod_256_prologue + +.rva .LSEH_body_add_mod_256 +.rva .LSEH_epilogue_add_mod_256 +.rva .LSEH_info_add_mod_256_body + +.rva .LSEH_epilogue_add_mod_256 +.rva .LSEH_end_add_mod_256 +.rva .LSEH_info_add_mod_256_epilogue + +.rva .LSEH_begin_mul_by_3_mod_256 +.rva .LSEH_body_mul_by_3_mod_256 +.rva .LSEH_info_mul_by_3_mod_256_prologue + +.rva .LSEH_body_mul_by_3_mod_256 +.rva .LSEH_epilogue_mul_by_3_mod_256 +.rva .LSEH_info_mul_by_3_mod_256_body + +.rva .LSEH_epilogue_mul_by_3_mod_256 +.rva .LSEH_end_mul_by_3_mod_256 +.rva .LSEH_info_mul_by_3_mod_256_epilogue + +.rva .LSEH_begin_lshift_mod_256 +.rva .LSEH_body_lshift_mod_256 +.rva .LSEH_info_lshift_mod_256_prologue + +.rva .LSEH_body_lshift_mod_256 +.rva .LSEH_epilogue_lshift_mod_256 +.rva .LSEH_info_lshift_mod_256_body + +.rva .LSEH_epilogue_lshift_mod_256 +.rva .LSEH_end_lshift_mod_256 +.rva .LSEH_info_lshift_mod_256_epilogue + +.rva 
.LSEH_begin_rshift_mod_256 +.rva .LSEH_body_rshift_mod_256 +.rva .LSEH_info_rshift_mod_256_prologue + +.rva .LSEH_body_rshift_mod_256 +.rva .LSEH_epilogue_rshift_mod_256 +.rva .LSEH_info_rshift_mod_256_body + +.rva .LSEH_epilogue_rshift_mod_256 +.rva .LSEH_end_rshift_mod_256 +.rva .LSEH_info_rshift_mod_256_epilogue + +.rva .LSEH_begin_cneg_mod_256 +.rva .LSEH_body_cneg_mod_256 +.rva .LSEH_info_cneg_mod_256_prologue + +.rva .LSEH_body_cneg_mod_256 +.rva .LSEH_epilogue_cneg_mod_256 +.rva .LSEH_info_cneg_mod_256_body + +.rva .LSEH_epilogue_cneg_mod_256 +.rva .LSEH_end_cneg_mod_256 +.rva .LSEH_info_cneg_mod_256_epilogue + +.rva .LSEH_begin_sub_mod_256 +.rva .LSEH_body_sub_mod_256 +.rva .LSEH_info_sub_mod_256_prologue + +.rva .LSEH_body_sub_mod_256 +.rva .LSEH_epilogue_sub_mod_256 +.rva .LSEH_info_sub_mod_256_body + +.rva .LSEH_epilogue_sub_mod_256 +.rva .LSEH_end_sub_mod_256 +.rva .LSEH_info_sub_mod_256_epilogue + +.rva .LSEH_epilogue_check_mod_256 +.rva .LSEH_end_check_mod_256 +.rva .LSEH_info_check_mod_256_epilogue + +.rva .LSEH_begin_add_n_check_mod_256 +.rva .LSEH_body_add_n_check_mod_256 +.rva .LSEH_info_add_n_check_mod_256_prologue + +.rva .LSEH_body_add_n_check_mod_256 +.rva .LSEH_epilogue_add_n_check_mod_256 +.rva .LSEH_info_add_n_check_mod_256_body + +.rva .LSEH_epilogue_add_n_check_mod_256 +.rva .LSEH_end_add_n_check_mod_256 +.rva .LSEH_info_add_n_check_mod_256_epilogue + +.rva .LSEH_begin_sub_n_check_mod_256 +.rva .LSEH_body_sub_n_check_mod_256 +.rva .LSEH_info_sub_n_check_mod_256_prologue + +.rva .LSEH_body_sub_n_check_mod_256 +.rva .LSEH_epilogue_sub_n_check_mod_256 +.rva .LSEH_info_sub_n_check_mod_256_body + +.rva .LSEH_epilogue_sub_n_check_mod_256 +.rva .LSEH_end_sub_n_check_mod_256 +.rva .LSEH_info_sub_n_check_mod_256_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_add_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_add_mod_256_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00 +.LSEH_info_add_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_3_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_3_mod_256_body: +.byte 1,0,11,0 +.byte 0x00,0xc4,0x00,0x00 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.LSEH_info_mul_by_3_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_lshift_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_lshift_mod_256_body: +.byte 1,0,11,0 +.byte 0x00,0xc4,0x00,0x00 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.LSEH_info_lshift_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_rshift_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_rshift_mod_256_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00 
+.LSEH_info_rshift_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_cneg_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_cneg_mod_256_body: +.byte 1,0,11,0 +.byte 0x00,0xc4,0x00,0x00 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.LSEH_info_cneg_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sub_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sub_mod_256_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00 +.LSEH_info_sub_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_check_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_add_n_check_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_add_n_check_mod_256_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00 +.LSEH_info_add_n_check_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sub_n_check_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sub_n_check_mod_256_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00 +.LSEH_info_sub_n_check_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/add_mod_384-armv8.S b/crypto/blst_src/build/coff/add_mod_384-armv8.S new file mode 100644 index 00000000000..2eff0677f54 --- /dev/null +++ b/crypto/blst_src/build/coff/add_mod_384-armv8.S @@ -0,0 +1,1056 @@ +.text + +.globl add_mod_384 + +.def add_mod_384; +.type 32; +.endef +.p2align 5 +add_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.def __add_mod_384; +.type 32; +.endef +.p2align 5 +__add_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + +__add_mod_384_ab_are_loaded: + adds x10,x10,x16 + adcs x11,x11,x17 + adcs x12,x12,x19 + adcs x13,x13,x20 + adcs x14,x14,x21 + adcs x15,x15,x22 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret + + +.globl add_mod_384x + +.def add_mod_384x; +.type 32; +.endef +.p2align 5 +add_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl rshift_mod_384 + +.def rshift_mod_384; +.type 32; +.endef +.p2align 5 +rshift_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +.Loop_rshift_mod_384: + sub x2,x2,#1 + bl __rshift_mod_384 + cbnz x2,.Loop_rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.def __rshift_mod_384; +.type 32; +.endef +.p2align 5 +__rshift_mod_384: + sbfx x22,x10,#0,#1 + and x16,x22,x4 + and x17,x22,x5 + adds x10,x10,x16 + and x19,x22,x6 + adcs x11,x11,x17 + and x20,x22,x7 + adcs x12,x12,x19 + and x21,x22,x8 + adcs x13,x13,x20 + and x22,x22,x9 + adcs x14,x14,x21 + extr x10,x11,x10,#1 // a[0:5] >>= 1 + adcs x15,x15,x22 + extr x11,x12,x11,#1 + adc x22,xzr,xzr + extr x12,x13,x12,#1 + extr x13,x14,x13,#1 + extr x14,x15,x14,#1 + extr x15,x22,x15,#1 + ret + + +.globl div_by_2_mod_384 + +.def div_by_2_mod_384; +.type 32; +.endef +.p2align 5 +div_by_2_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl lshift_mod_384 + +.def lshift_mod_384; +.type 32; +.endef +.p2align 5 +lshift_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +.Loop_lshift_mod_384: + sub x2,x2,#1 + bl __lshift_mod_384 + cbnz x2,.Loop_lshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.def __lshift_mod_384; +.type 32; +.endef +.p2align 5 +__lshift_mod_384: + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret + + +.globl mul_by_3_mod_384 + +.def mul_by_3_mod_384; +.type 32; +.endef +.p2align 5 +mul_by_3_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl mul_by_8_mod_384 + +.def mul_by_8_mod_384; +.type 32; +.endef +.p2align 5 +mul_by_8_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl mul_by_3_mod_384x + +.def mul_by_3_mod_384x; +.type 32; +.endef +.p2align 5 +mul_by_3_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + + ldp x16,x17,[x1,#48] + ldp x19,x20,[x1,#64] + ldp x21,x22,[x1,#80] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl mul_by_8_mod_384x + +.def mul_by_8_mod_384x; +.type 32; +.endef +.p2align 5 +mul_by_8_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl cneg_mod_384 + +.def cneg_mod_384; +.type 32; +.endef +.p2align 5 +cneg_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x4,x5,[x3] + ldp x12,x13,[x1,#16] + ldp x6,x7,[x3,#16] + + subs x16,x4,x10 + ldp x14,x15,[x1,#32] + ldp x8,x9,[x3,#32] + orr x3,x10,x11 + sbcs x17,x5,x11 + orr x3,x3,x12 + sbcs x19,x6,x12 + orr x3,x3,x13 + sbcs x20,x7,x13 + orr x3,x3,x14 + sbcs x21,x8,x14 + orr x3,x3,x15 + sbc x22,x9,x15 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x10,x10,x16,eq + csel x11,x11,x17,eq + csel x12,x12,x19,eq + csel x13,x13,x20,eq + stp x10,x11,[x0] + csel x14,x14,x21,eq + stp x12,x13,[x0,#16] + csel x15,x15,x22,eq + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl sub_mod_384 + +.def sub_mod_384; +.type 32; +.endef +.p2align 5 +sub_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.def __sub_mod_384; +.type 32; +.endef +.p2align 5 +__sub_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + + subs x10,x10,x16 + sbcs x11,x11,x17 + sbcs x12,x12,x19 + sbcs x13,x13,x20 + sbcs x14,x14,x21 + sbcs x15,x15,x22 + sbc x3,xzr,xzr + + and x16,x4,x3 + and x17,x5,x3 + adds x10,x10,x16 + and x19,x6,x3 + adcs x11,x11,x17 + and x20,x7,x3 + adcs x12,x12,x19 + and x21,x8,x3 + adcs x13,x13,x20 + and x22,x9,x3 + adcs x14,x14,x21 + adc x15,x15,x22 + + ret + + +.globl sub_mod_384x + +.def sub_mod_384x; +.type 32; +.endef +.p2align 5 +sub_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl mul_by_1_plus_i_mod_384x + +.def mul_by_1_plus_i_mod_384x; +.type 32; +.endef +.p2align 5 +mul_by_1_plus_i_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + add x2,x1,#48 + + bl __sub_mod_384 // a->re - a->im + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __add_mod_384_ab_are_loaded // a->re + a->im + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl sgn0_pty_mod_384 + +.def sgn0_pty_mod_384; +.type 32; +.endef +.p2align 5 +sgn0_pty_mod_384: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x0,x10,#1 + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x3,x3,xzr + + mvn x3,x3 + and x3,x3,#2 + orr x0,x0,x3 + + ret + + +.globl sgn0_pty_mod_384x + +.def sgn0_pty_mod_384x; +.type 32; +.endef +.p2align 5 +sgn0_pty_mod_384x: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x2,x10,#1 + orr x3,x10,x11 + adds x10,x10,x10 + orr x3,x3,x12 + adcs x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + ldp x10,x11,[x0,#48] + ldp x12,x13,[x0,#64] + ldp x14,x15,[x0,#80] + + mvn x16,x16 + and x16,x16,#2 + orr x2,x2,x16 + + and x0,x10,#1 + orr x1,x10,x11 + adds x10,x10,x10 + orr x1,x1,x12 + adcs x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + mvn x16,x16 + and x16,x16,#2 + orr x0,x0,x16 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ret + +.globl vec_select_32 + +.def vec_select_32; +.type 32; +.endef +.p2align 5 +vec_select_32: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + +.globl vec_select_48 + +.def vec_select_48; +.type 32; +.endef +.p2align 5 +vec_select_48: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + +.globl vec_select_96 + +.def vec_select_96; +.type 32; +.endef +.p2align 5 +vec_select_96: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + +.globl vec_select_192 + +.def vec_select_192; +.type 32; +.endef +.p2align 5 +vec_select_192: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + +.globl vec_select_144 + +.def vec_select_144; +.type 32; +.endef +.p2align 5 +vec_select_144: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + +.globl vec_select_288 + +.def vec_select_288; +.type 32; +.endef +.p2align 5 +vec_select_288: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + 
bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + +.globl vec_prefetch + +.def vec_prefetch; +.type 32; +.endef +.p2align 5 +vec_prefetch: + add x1, x1, x0 + sub x1, x1, #1 + mov x2, #64 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + prfm pldl1keep, [x0] + ret + +.globl vec_is_zero_16x + +.def vec_is_zero_16x; +.type 32; +.endef +.p2align 5 +vec_is_zero_16x: + ld1 {v0.2d}, [x0], #16 + lsr x1, x1, #4 + sub x1, x1, #1 + cbz x1, .Loop_is_zero_done + +.Loop_is_zero: + ld1 {v1.2d}, [x0], #16 + orr v0.16b, v0.16b, v1.16b + sub x1, x1, #1 + cbnz x1, .Loop_is_zero + +.Loop_is_zero_done: + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret + +.globl vec_is_equal_16x + +.def vec_is_equal_16x; +.type 32; +.endef +.p2align 5 +vec_is_equal_16x: + ld1 {v0.2d}, [x0], #16 + ld1 {v1.2d}, [x1], #16 + lsr x2, x2, #4 + eor v0.16b, v0.16b, v1.16b + +.Loop_is_equal: + sub x2, x2, #1 + cbz x2, .Loop_is_equal_done + ld1 {v1.2d}, [x0], #16 + ld1 {v2.2d}, [x1], #16 + eor v1.16b, v1.16b, v2.16b + orr v0.16b, v0.16b, v1.16b + b .Loop_is_equal + nop + +.Loop_is_equal_done: + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret + diff --git a/crypto/blst_src/build/coff/add_mod_384-x86_64.s b/crypto/blst_src/build/coff/add_mod_384-x86_64.s new file mode 100644 index 00000000000..d1c7ad6e689 --- /dev/null +++ b/crypto/blst_src/build/coff/add_mod_384-x86_64.s @@ -0,0 +1,2481 @@ +.text + +.globl add_mod_384 + +.def add_mod_384; .scl 2; .type 32; .endef +.p2align 5 +add_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_add_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_add_mod_384: + + + call __add_mod_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_add_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + 
.byte 0xf3,0xc3 + +.LSEH_end_add_mod_384: + +.def __add_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__add_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__add_mod_384_a_is_loaded: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + + +.globl add_mod_384x + +.def add_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +add_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_add_mod_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $24,%rsp + +.LSEH_body_add_mod_384x: + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __add_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + movq 24+0(%rsp),%r15 + + movq 24+8(%rsp),%r14 + + movq 24+16(%rsp),%r13 + + movq 24+24(%rsp),%r12 + + movq 24+32(%rsp),%rbx + + movq 24+40(%rsp),%rbp + + leaq 24+48(%rsp),%rsp + +.LSEH_epilogue_add_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_add_mod_384x: + + +.globl rshift_mod_384 + +.def rshift_mod_384; .scl 2; .type 32; .endef +.p2align 5 +rshift_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_rshift_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_rshift_mod_384: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +.Loop_rshift_mod_384: + call __rshift_mod_384 + decl %edx + jnz .Loop_rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_rshift_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_rshift_mod_384: + +.def __rshift_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__rshift_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rsi + movq 0(%rcx),%r14 + andq %r8,%rsi + movq 8(%rcx),%r15 + negq %rsi + movq 16(%rcx),%rax + andq %rsi,%r14 + movq 24(%rcx),%rbx + andq %rsi,%r15 + movq 32(%rcx),%rbp + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%rbx + adcq %r12,%rbp + adcq %r13,%rsi + sbbq %r13,%r13 + + shrq $1,%r14 + movq %r15,%r8 + shrq $1,%r15 + movq %rax,%r9 + shrq $1,%rax + movq %rbx,%r10 + shrq $1,%rbx + movq %rbp,%r11 + shrq $1,%rbp + movq %rsi,%r12 + shrq 
$1,%rsi + shlq $63,%r8 + shlq $63,%r9 + orq %r14,%r8 + shlq $63,%r10 + orq %r15,%r9 + shlq $63,%r11 + orq %rax,%r10 + shlq $63,%r12 + orq %rbx,%r11 + shlq $63,%r13 + orq %rbp,%r12 + orq %rsi,%r13 + + .byte 0xf3,0xc3 + + +.globl div_by_2_mod_384 + +.def div_by_2_mod_384; .scl 2; .type 32; .endef +.p2align 5 +div_by_2_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_div_by_2_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_div_by_2_mod_384: + + + movq 0(%rsi),%r8 + movq %rdx,%rcx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + call __rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_div_by_2_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_div_by_2_mod_384: + + +.globl lshift_mod_384 + +.def lshift_mod_384; .scl 2; .type 32; .endef +.p2align 5 +lshift_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_lshift_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_lshift_mod_384: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +.Loop_lshift_mod_384: + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdi,%rdi + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdi + + movq (%rsp),%rdi + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + decl %edx + jnz .Loop_lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_lshift_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_lshift_mod_384: + +.def __lshift_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__lshift_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + .byte 0xf3,0xc3 + + + +.globl mul_by_3_mod_384 + +.def mul_by_3_mod_384; .scl 2; .type 32; .endef +.p2align 5 +mul_by_3_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_3_mod_384: + 
movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_mul_by_3_mod_384: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_by_3_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_3_mod_384: + +.globl mul_by_8_mod_384 + +.def mul_by_8_mod_384; .scl 2; .type 32; .endef +.p2align 5 +mul_by_8_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_8_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_mul_by_8_mod_384: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_by_8_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_8_mod_384: + + +.globl mul_by_3_mod_384x + +.def mul_by_3_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +mul_by_3_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_3_mod_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_mul_by_3_mod_384x: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq (%rsp),%rsi + leaq 48(%rdi),%rdi + + movq 48(%rsi),%r8 + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + movq 72(%rsi),%r11 + movq 80(%rsi),%r12 + movq 88(%rsi),%r13 + + call __lshift_mod_384 + + movq $48,%rdx + addq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_by_3_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_3_mod_384x: + +.globl mul_by_8_mod_384x + +.def mul_by_8_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +mul_by_8_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_8_mod_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_mul_by_8_mod_384x: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + 
call __lshift_mod_384 + + movq (%rsp),%rsi + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,48+0(%rdi) + movq %r9,48+8(%rdi) + movq %r10,48+16(%rdi) + movq %r11,48+24(%rdi) + movq %r12,48+32(%rdi) + movq %r13,48+40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_by_8_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_8_mod_384x: + + +.globl cneg_mod_384 + +.def cneg_mod_384; .scl 2; .type 32; .endef +.p2align 5 +cneg_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_cneg_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdx + +.LSEH_body_cneg_mod_384: + + + movq 0(%rsi),%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rdx,%r8 + movq 24(%rsi),%r11 + orq %r9,%rdx + movq 32(%rsi),%r12 + orq %r10,%rdx + movq 40(%rsi),%r13 + orq %r11,%rdx + movq $-1,%rsi + orq %r12,%rdx + orq %r13,%rdx + + movq 0(%rcx),%r14 + cmovnzq %rsi,%rdx + movq 8(%rcx),%r15 + movq 16(%rcx),%rax + andq %rdx,%r14 + movq 24(%rcx),%rbx + andq %rdx,%r15 + movq 32(%rcx),%rbp + andq %rdx,%rax + movq 40(%rcx),%rsi + andq %rdx,%rbx + movq 0(%rsp),%rcx + andq %rdx,%rbp + andq %rdx,%rsi + + subq %r8,%r14 + sbbq %r9,%r15 + sbbq %r10,%rax + sbbq %r11,%rbx + sbbq %r12,%rbp + sbbq %r13,%rsi + + orq %rcx,%rcx + + cmovzq %r8,%r14 + cmovzq %r9,%r15 + cmovzq %r10,%rax + movq %r14,0(%rdi) + cmovzq %r11,%rbx + movq %r15,8(%rdi) + cmovzq %r12,%rbp + movq %rax,16(%rdi) + cmovzq %r13,%rsi + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rsi,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_cneg_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_cneg_mod_384: + + +.globl sub_mod_384 + +.def sub_mod_384; .scl 2; .type 32; .endef +.p2align 5 +sub_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sub_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sub_mod_384: + + + call __sub_mod_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sub_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sub_mod_384: + +.def __sub_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + 
sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + + +.globl sub_mod_384x + +.def sub_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +sub_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sub_mod_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $24,%rsp + +.LSEH_body_sub_mod_384x: + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __sub_mod_384 + + movq 24+0(%rsp),%r15 + + movq 24+8(%rsp),%r14 + + movq 24+16(%rsp),%r13 + + movq 24+24(%rsp),%r12 + + movq 24+32(%rsp),%rbx + + movq 24+40(%rsp),%rbp + + leaq 24+48(%rsp),%rsp + +.LSEH_epilogue_sub_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sub_mod_384x: +.globl mul_by_1_plus_i_mod_384x + +.def mul_by_1_plus_i_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +mul_by_1_plus_i_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_1_plus_i_mod_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $56,%rsp + +.LSEH_body_mul_by_1_plus_i_mod_384x: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rbx + adcq 72(%rsi),%r11 + movq %r12,%rcx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + movq %rdi,48(%rsp) + sbbq %rdi,%rdi + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rbx + sbbq 80(%rsi),%rcx + sbbq 88(%rsi),%rbp + sbbq %rsi,%rsi + + movq %r8,0(%rsp) + movq 0(%rdx),%r8 + movq %r9,8(%rsp) + movq 8(%rdx),%r9 + movq %r10,16(%rsp) + movq 16(%rdx),%r10 + movq %r11,24(%rsp) + movq 24(%rdx),%r11 + movq %r12,32(%rsp) + andq %rsi,%r8 + movq 32(%rdx),%r12 + movq %r13,40(%rsp) + andq %rsi,%r9 + movq 40(%rdx),%r13 + andq %rsi,%r10 + andq %rsi,%r11 + andq %rsi,%r12 + andq %rsi,%r13 + movq 48(%rsp),%rsi + + addq %r8,%r14 + movq 0(%rsp),%r8 + adcq %r9,%r15 + movq 8(%rsp),%r9 + adcq %r10,%rax + movq 16(%rsp),%r10 + adcq %r11,%rbx + movq 24(%rsp),%r11 + adcq %r12,%rcx + movq 32(%rsp),%r12 + adcq %r13,%rbp + movq 40(%rsp),%r13 + + movq %r14,0(%rsi) + movq %r8,%r14 + movq %r15,8(%rsi) + movq %rax,16(%rsi) + movq %r9,%r15 + movq %rbx,24(%rsi) + movq %rcx,32(%rsi) + movq %r10,%rax + movq %rbp,40(%rsi) + + subq 0(%rdx),%r8 + movq %r11,%rbx + sbbq 8(%rdx),%r9 + sbbq 16(%rdx),%r10 + movq %r12,%rcx + sbbq 24(%rdx),%r11 + sbbq 32(%rdx),%r12 + movq %r13,%rbp + sbbq 40(%rdx),%r13 + sbbq $0,%rdi + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,48(%rsi) + cmovcq %rbx,%r11 + movq %r9,56(%rsi) + cmovcq %rcx,%r12 + movq %r10,64(%rsi) + cmovcq %rbp,%r13 + movq %r11,72(%rsi) + movq %r12,80(%rsi) + movq %r13,88(%rsi) + + movq 56+0(%rsp),%r15 + + movq 56+8(%rsp),%r14 + + movq 56+16(%rsp),%r13 + + 
movq 56+24(%rsp),%r12 + + movq 56+32(%rsp),%rbx + + movq 56+40(%rsp),%rbp + + leaq 56+48(%rsp),%rsp + +.LSEH_epilogue_mul_by_1_plus_i_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_1_plus_i_mod_384x: +.globl sgn0_pty_mod_384 + +.def sgn0_pty_mod_384; .scl 2; .type 32; .endef +.p2align 5 +sgn0_pty_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0_pty_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + + +.LSEH_body_sgn0_pty_mod_384: + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + movq 40(%rdi),%rdx + + xorq %rax,%rax + movq %r8,%rdi + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + notq %rax + andq $1,%rdi + andq $2,%rax + orq %rdi,%rax + +.LSEH_epilogue_sgn0_pty_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0_pty_mod_384: + +.globl sgn0_pty_mod_384x + +.def sgn0_pty_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +sgn0_pty_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0_pty_mod_384x: + movq %rcx,%rdi + movq %rdx,%rsi + + + pushq %rbp + + pushq %rbx + + subq $8,%rsp + +.LSEH_body_sgn0_pty_mod_384x: + + + movq 48(%rdi),%r8 + movq 56(%rdi),%r9 + movq 64(%rdi),%r10 + movq 72(%rdi),%r11 + movq 80(%rdi),%rcx + movq 88(%rdi),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + leaq 0(%rdi),%rax + xorq %rdi,%rdi + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rdi + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rdi + + movq %r8,0(%rsp) + notq %rdi + andq $1,%rbp + andq $2,%rdi + orq %rbp,%rdi + + movq 0(%rax),%r8 + movq 8(%rax),%r9 + movq 16(%rax),%r10 + movq 24(%rax),%r11 + movq 32(%rax),%rcx + movq 40(%rax),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + xorq %rax,%rax + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + movq 0(%rsp),%rbx + + notq %rax + + testq %r8,%r8 + cmovzq %rdi,%rbp + + testq %rbx,%rbx + cmovnzq %rdi,%rax + + andq $1,%rbp + andq $2,%rax + orq %rbp,%rax + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_sgn0_pty_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0_pty_mod_384x: +.globl vec_select_32 + +.def vec_select_32; .scl 2; .type 32; .endef +.p2align 5 +vec_select_32: + .byte 0xf3,0x0f,0x1e,0xfa + + movd %r9d,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rdx),%xmm0 + leaq 16(%rdx),%rdx + pcmpeqd %xmm4,%xmm5 + movdqu (%r8),%xmm1 + leaq 16(%r8),%r8 + pcmpeqd %xmm5,%xmm4 + leaq 16(%rcx),%rcx + pand %xmm4,%xmm0 + movdqu 0+16-16(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-16(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-16(%rcx) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,16-16(%rcx) + .byte 0xf3,0xc3 + +.globl vec_select_48 + 
+.def vec_select_48; .scl 2; .type 32; .endef +.p2align 5 +vec_select_48: + .byte 0xf3,0x0f,0x1e,0xfa + + movd %r9d,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rdx),%xmm0 + leaq 24(%rdx),%rdx + pcmpeqd %xmm4,%xmm5 + movdqu (%r8),%xmm1 + leaq 24(%r8),%r8 + pcmpeqd %xmm5,%xmm4 + leaq 24(%rcx),%rcx + pand %xmm4,%xmm0 + movdqu 0+16-24(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-24(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-24(%rcx) + pand %xmm4,%xmm2 + movdqu 16+16-24(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-24(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-24(%rcx) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,32-24(%rcx) + .byte 0xf3,0xc3 + +.globl vec_select_96 + +.def vec_select_96; .scl 2; .type 32; .endef +.p2align 5 +vec_select_96: + .byte 0xf3,0x0f,0x1e,0xfa + + movd %r9d,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rdx),%xmm0 + leaq 48(%rdx),%rdx + pcmpeqd %xmm4,%xmm5 + movdqu (%r8),%xmm1 + leaq 48(%r8),%r8 + pcmpeqd %xmm5,%xmm4 + leaq 48(%rcx),%rcx + pand %xmm4,%xmm0 + movdqu 0+16-48(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-48(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-48(%rcx) + pand %xmm4,%xmm2 + movdqu 16+16-48(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-48(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-48(%rcx) + pand %xmm4,%xmm0 + movdqu 32+16-48(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-48(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-48(%rcx) + pand %xmm4,%xmm2 + movdqu 48+16-48(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-48(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-48(%rcx) + pand %xmm4,%xmm0 + movdqu 64+16-48(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-48(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-48(%rcx) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,80-48(%rcx) + .byte 0xf3,0xc3 + +.globl vec_select_192 + +.def vec_select_192; .scl 2; .type 32; .endef +.p2align 5 +vec_select_192: + .byte 0xf3,0x0f,0x1e,0xfa + + movd %r9d,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rdx),%xmm0 + leaq 96(%rdx),%rdx + pcmpeqd %xmm4,%xmm5 + movdqu (%r8),%xmm1 + leaq 96(%r8),%r8 + pcmpeqd %xmm5,%xmm4 + leaq 96(%rcx),%rcx + pand %xmm4,%xmm0 + movdqu 0+16-96(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-96(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-96(%rcx) + pand %xmm4,%xmm2 + movdqu 16+16-96(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-96(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-96(%rcx) + pand %xmm4,%xmm0 + movdqu 32+16-96(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-96(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-96(%rcx) + pand %xmm4,%xmm2 + movdqu 48+16-96(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-96(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-96(%rcx) + pand %xmm4,%xmm0 + movdqu 64+16-96(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-96(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-96(%rcx) + pand %xmm4,%xmm2 + movdqu 80+16-96(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-96(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-96(%rcx) + pand %xmm4,%xmm0 + movdqu 96+16-96(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-96(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-96(%rcx) + pand %xmm4,%xmm2 + movdqu 112+16-96(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-96(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-96(%rcx) + pand %xmm4,%xmm0 + movdqu 128+16-96(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-96(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-96(%rcx) + pand %xmm4,%xmm2 + movdqu 144+16-96(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 
144+16-96(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-96(%rcx) + pand %xmm4,%xmm0 + movdqu 160+16-96(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-96(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-96(%rcx) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,176-96(%rcx) + .byte 0xf3,0xc3 + +.globl vec_select_144 + +.def vec_select_144; .scl 2; .type 32; .endef +.p2align 5 +vec_select_144: + .byte 0xf3,0x0f,0x1e,0xfa + + movd %r9d,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rdx),%xmm0 + leaq 72(%rdx),%rdx + pcmpeqd %xmm4,%xmm5 + movdqu (%r8),%xmm1 + leaq 72(%r8),%r8 + pcmpeqd %xmm5,%xmm4 + leaq 72(%rcx),%rcx + pand %xmm4,%xmm0 + movdqu 0+16-72(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-72(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-72(%rcx) + pand %xmm4,%xmm2 + movdqu 16+16-72(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-72(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-72(%rcx) + pand %xmm4,%xmm0 + movdqu 32+16-72(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-72(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-72(%rcx) + pand %xmm4,%xmm2 + movdqu 48+16-72(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-72(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-72(%rcx) + pand %xmm4,%xmm0 + movdqu 64+16-72(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-72(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-72(%rcx) + pand %xmm4,%xmm2 + movdqu 80+16-72(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-72(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-72(%rcx) + pand %xmm4,%xmm0 + movdqu 96+16-72(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-72(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-72(%rcx) + pand %xmm4,%xmm2 + movdqu 112+16-72(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-72(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-72(%rcx) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,128-72(%rcx) + .byte 0xf3,0xc3 + +.globl vec_select_288 + +.def vec_select_288; .scl 2; .type 32; .endef +.p2align 5 +vec_select_288: + .byte 0xf3,0x0f,0x1e,0xfa + + movd %r9d,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rdx),%xmm0 + leaq 144(%rdx),%rdx + pcmpeqd %xmm4,%xmm5 + movdqu (%r8),%xmm1 + leaq 144(%r8),%r8 + pcmpeqd %xmm5,%xmm4 + leaq 144(%rcx),%rcx + pand %xmm4,%xmm0 + movdqu 0+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-144(%rcx) + pand %xmm4,%xmm2 + movdqu 16+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-144(%rcx) + pand %xmm4,%xmm0 + movdqu 32+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-144(%rcx) + pand %xmm4,%xmm2 + movdqu 48+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-144(%rcx) + pand %xmm4,%xmm0 + movdqu 64+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-144(%rcx) + pand %xmm4,%xmm2 + movdqu 80+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-144(%rcx) + pand %xmm4,%xmm0 + movdqu 96+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-144(%rcx) + pand %xmm4,%xmm2 + movdqu 112+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-144(%rcx) + pand %xmm4,%xmm0 + movdqu 128+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-144(%rcx) + pand 
%xmm4,%xmm2 + movdqu 144+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-144(%rcx) + pand %xmm4,%xmm0 + movdqu 160+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-144(%rcx) + pand %xmm4,%xmm2 + movdqu 176+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 176+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,176-144(%rcx) + pand %xmm4,%xmm0 + movdqu 192+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 192+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,192-144(%rcx) + pand %xmm4,%xmm2 + movdqu 208+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 208+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,208-144(%rcx) + pand %xmm4,%xmm0 + movdqu 224+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 224+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,224-144(%rcx) + pand %xmm4,%xmm2 + movdqu 240+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 240+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,240-144(%rcx) + pand %xmm4,%xmm0 + movdqu 256+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 256+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,256-144(%rcx) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,272-144(%rcx) + .byte 0xf3,0xc3 + +.globl vec_prefetch + +.def vec_prefetch; .scl 2; .type 32; .endef +.p2align 5 +vec_prefetch: + .byte 0xf3,0x0f,0x1e,0xfa + + leaq -1(%rcx,%rdx,1),%rdx + movq $64,%rax + xorq %r8,%r8 + prefetchnta (%rcx) + leaq (%rcx,%rax,1),%rcx + cmpq %rdx,%rcx + cmovaq %rdx,%rcx + cmovaq %r8,%rax + prefetchnta (%rcx) + leaq (%rcx,%rax,1),%rcx + cmpq %rdx,%rcx + cmovaq %rdx,%rcx + cmovaq %r8,%rax + prefetchnta (%rcx) + leaq (%rcx,%rax,1),%rcx + cmpq %rdx,%rcx + cmovaq %rdx,%rcx + cmovaq %r8,%rax + prefetchnta (%rcx) + leaq (%rcx,%rax,1),%rcx + cmpq %rdx,%rcx + cmovaq %rdx,%rcx + cmovaq %r8,%rax + prefetchnta (%rcx) + leaq (%rcx,%rax,1),%rcx + cmpq %rdx,%rcx + cmovaq %rdx,%rcx + cmovaq %r8,%rax + prefetchnta (%rcx) + leaq (%rcx,%rax,1),%rcx + cmpq %rdx,%rcx + cmovaq %rdx,%rcx + prefetchnta (%rcx) + .byte 0xf3,0xc3 + +.globl vec_is_zero_16x + +.def vec_is_zero_16x; .scl 2; .type 32; .endef +.p2align 5 +vec_is_zero_16x: + .byte 0xf3,0x0f,0x1e,0xfa + + shrl $4,%edx + movdqu (%rcx),%xmm0 + leaq 16(%rcx),%rcx + +.Loop_is_zero: + decl %edx + jz .Loop_is_zero_done + movdqu (%rcx),%xmm1 + leaq 16(%rcx),%rcx + por %xmm1,%xmm0 + jmp .Loop_is_zero + +.Loop_is_zero_done: + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 +.byte 102,72,15,126,192 + incl %edx + testq %rax,%rax + cmovnzl %edx,%eax + xorl $1,%eax + .byte 0xf3,0xc3 + +.globl vec_is_equal_16x + +.def vec_is_equal_16x; .scl 2; .type 32; .endef +.p2align 5 +vec_is_equal_16x: + .byte 0xf3,0x0f,0x1e,0xfa + + shrl $4,%r8d + movdqu (%rcx),%xmm0 + movdqu (%rdx),%xmm1 + subq %rcx,%rdx + leaq 16(%rcx),%rcx + pxor %xmm1,%xmm0 + +.Loop_is_equal: + decl %r8d + jz .Loop_is_equal_done + movdqu (%rcx),%xmm1 + movdqu (%rcx,%rdx,1),%xmm2 + leaq 16(%rcx),%rcx + pxor %xmm2,%xmm1 + por %xmm1,%xmm0 + jmp .Loop_is_equal + +.Loop_is_equal_done: + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 +.byte 102,72,15,126,192 + incl %r8d + testq %rax,%rax + cmovnzl %r8d,%eax + xorl $1,%eax + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_add_mod_384 +.rva .LSEH_body_add_mod_384 +.rva .LSEH_info_add_mod_384_prologue + +.rva .LSEH_body_add_mod_384 +.rva .LSEH_epilogue_add_mod_384 +.rva .LSEH_info_add_mod_384_body + +.rva .LSEH_epilogue_add_mod_384 +.rva .LSEH_end_add_mod_384 +.rva .LSEH_info_add_mod_384_epilogue + +.rva 
.LSEH_begin_add_mod_384x +.rva .LSEH_body_add_mod_384x +.rva .LSEH_info_add_mod_384x_prologue + +.rva .LSEH_body_add_mod_384x +.rva .LSEH_epilogue_add_mod_384x +.rva .LSEH_info_add_mod_384x_body + +.rva .LSEH_epilogue_add_mod_384x +.rva .LSEH_end_add_mod_384x +.rva .LSEH_info_add_mod_384x_epilogue + +.rva .LSEH_begin_rshift_mod_384 +.rva .LSEH_body_rshift_mod_384 +.rva .LSEH_info_rshift_mod_384_prologue + +.rva .LSEH_body_rshift_mod_384 +.rva .LSEH_epilogue_rshift_mod_384 +.rva .LSEH_info_rshift_mod_384_body + +.rva .LSEH_epilogue_rshift_mod_384 +.rva .LSEH_end_rshift_mod_384 +.rva .LSEH_info_rshift_mod_384_epilogue + +.rva .LSEH_begin_div_by_2_mod_384 +.rva .LSEH_body_div_by_2_mod_384 +.rva .LSEH_info_div_by_2_mod_384_prologue + +.rva .LSEH_body_div_by_2_mod_384 +.rva .LSEH_epilogue_div_by_2_mod_384 +.rva .LSEH_info_div_by_2_mod_384_body + +.rva .LSEH_epilogue_div_by_2_mod_384 +.rva .LSEH_end_div_by_2_mod_384 +.rva .LSEH_info_div_by_2_mod_384_epilogue + +.rva .LSEH_begin_lshift_mod_384 +.rva .LSEH_body_lshift_mod_384 +.rva .LSEH_info_lshift_mod_384_prologue + +.rva .LSEH_body_lshift_mod_384 +.rva .LSEH_epilogue_lshift_mod_384 +.rva .LSEH_info_lshift_mod_384_body + +.rva .LSEH_epilogue_lshift_mod_384 +.rva .LSEH_end_lshift_mod_384 +.rva .LSEH_info_lshift_mod_384_epilogue + +.rva .LSEH_begin_mul_by_3_mod_384 +.rva .LSEH_body_mul_by_3_mod_384 +.rva .LSEH_info_mul_by_3_mod_384_prologue + +.rva .LSEH_body_mul_by_3_mod_384 +.rva .LSEH_epilogue_mul_by_3_mod_384 +.rva .LSEH_info_mul_by_3_mod_384_body + +.rva .LSEH_epilogue_mul_by_3_mod_384 +.rva .LSEH_end_mul_by_3_mod_384 +.rva .LSEH_info_mul_by_3_mod_384_epilogue + +.rva .LSEH_begin_mul_by_8_mod_384 +.rva .LSEH_body_mul_by_8_mod_384 +.rva .LSEH_info_mul_by_8_mod_384_prologue + +.rva .LSEH_body_mul_by_8_mod_384 +.rva .LSEH_epilogue_mul_by_8_mod_384 +.rva .LSEH_info_mul_by_8_mod_384_body + +.rva .LSEH_epilogue_mul_by_8_mod_384 +.rva .LSEH_end_mul_by_8_mod_384 +.rva .LSEH_info_mul_by_8_mod_384_epilogue + +.rva .LSEH_begin_mul_by_3_mod_384x +.rva .LSEH_body_mul_by_3_mod_384x +.rva .LSEH_info_mul_by_3_mod_384x_prologue + +.rva .LSEH_body_mul_by_3_mod_384x +.rva .LSEH_epilogue_mul_by_3_mod_384x +.rva .LSEH_info_mul_by_3_mod_384x_body + +.rva .LSEH_epilogue_mul_by_3_mod_384x +.rva .LSEH_end_mul_by_3_mod_384x +.rva .LSEH_info_mul_by_3_mod_384x_epilogue + +.rva .LSEH_begin_mul_by_8_mod_384x +.rva .LSEH_body_mul_by_8_mod_384x +.rva .LSEH_info_mul_by_8_mod_384x_prologue + +.rva .LSEH_body_mul_by_8_mod_384x +.rva .LSEH_epilogue_mul_by_8_mod_384x +.rva .LSEH_info_mul_by_8_mod_384x_body + +.rva .LSEH_epilogue_mul_by_8_mod_384x +.rva .LSEH_end_mul_by_8_mod_384x +.rva .LSEH_info_mul_by_8_mod_384x_epilogue + +.rva .LSEH_begin_cneg_mod_384 +.rva .LSEH_body_cneg_mod_384 +.rva .LSEH_info_cneg_mod_384_prologue + +.rva .LSEH_body_cneg_mod_384 +.rva .LSEH_epilogue_cneg_mod_384 +.rva .LSEH_info_cneg_mod_384_body + +.rva .LSEH_epilogue_cneg_mod_384 +.rva .LSEH_end_cneg_mod_384 +.rva .LSEH_info_cneg_mod_384_epilogue + +.rva .LSEH_begin_sub_mod_384 +.rva .LSEH_body_sub_mod_384 +.rva .LSEH_info_sub_mod_384_prologue + +.rva .LSEH_body_sub_mod_384 +.rva .LSEH_epilogue_sub_mod_384 +.rva .LSEH_info_sub_mod_384_body + +.rva .LSEH_epilogue_sub_mod_384 +.rva .LSEH_end_sub_mod_384 +.rva .LSEH_info_sub_mod_384_epilogue + +.rva .LSEH_begin_sub_mod_384x +.rva .LSEH_body_sub_mod_384x +.rva .LSEH_info_sub_mod_384x_prologue + +.rva .LSEH_body_sub_mod_384x +.rva .LSEH_epilogue_sub_mod_384x +.rva .LSEH_info_sub_mod_384x_body + +.rva .LSEH_epilogue_sub_mod_384x +.rva .LSEH_end_sub_mod_384x 
+.rva .LSEH_info_sub_mod_384x_epilogue + +.rva .LSEH_begin_mul_by_1_plus_i_mod_384x +.rva .LSEH_body_mul_by_1_plus_i_mod_384x +.rva .LSEH_info_mul_by_1_plus_i_mod_384x_prologue + +.rva .LSEH_body_mul_by_1_plus_i_mod_384x +.rva .LSEH_epilogue_mul_by_1_plus_i_mod_384x +.rva .LSEH_info_mul_by_1_plus_i_mod_384x_body + +.rva .LSEH_epilogue_mul_by_1_plus_i_mod_384x +.rva .LSEH_end_mul_by_1_plus_i_mod_384x +.rva .LSEH_info_mul_by_1_plus_i_mod_384x_epilogue + +.rva .LSEH_begin_sgn0_pty_mod_384 +.rva .LSEH_body_sgn0_pty_mod_384 +.rva .LSEH_info_sgn0_pty_mod_384_prologue + +.rva .LSEH_body_sgn0_pty_mod_384 +.rva .LSEH_epilogue_sgn0_pty_mod_384 +.rva .LSEH_info_sgn0_pty_mod_384_body + +.rva .LSEH_epilogue_sgn0_pty_mod_384 +.rva .LSEH_end_sgn0_pty_mod_384 +.rva .LSEH_info_sgn0_pty_mod_384_epilogue + +.rva .LSEH_begin_sgn0_pty_mod_384x +.rva .LSEH_body_sgn0_pty_mod_384x +.rva .LSEH_info_sgn0_pty_mod_384x_prologue + +.rva .LSEH_body_sgn0_pty_mod_384x +.rva .LSEH_epilogue_sgn0_pty_mod_384x +.rva .LSEH_info_sgn0_pty_mod_384x_body + +.rva .LSEH_epilogue_sgn0_pty_mod_384x +.rva .LSEH_end_sgn0_pty_mod_384x +.rva .LSEH_info_sgn0_pty_mod_384x_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_add_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_add_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_add_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_add_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_add_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00 +.LSEH_info_add_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_rshift_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_rshift_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_rshift_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_div_by_2_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_div_by_2_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_div_by_2_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_lshift_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_lshift_mod_384_body: +.byte 1,0,17,0 +.byte 
0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_lshift_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_3_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_3_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mul_by_3_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_8_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_8_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mul_by_8_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_3_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_3_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mul_by_3_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_8_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_8_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mul_by_8_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_cneg_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_cneg_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_cneg_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sub_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sub_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 
0x00,0x00 +.LSEH_info_sub_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sub_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sub_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00 +.LSEH_info_sub_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_1_plus_i_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_1_plus_i_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x07,0x00 +.byte 0x00,0xe4,0x08,0x00 +.byte 0x00,0xd4,0x09,0x00 +.byte 0x00,0xc4,0x0a,0x00 +.byte 0x00,0x34,0x0b,0x00 +.byte 0x00,0x54,0x0c,0x00 +.byte 0x00,0x74,0x0e,0x00 +.byte 0x00,0x64,0x0f,0x00 +.byte 0x00,0xc2 +.byte 0x00,0x00 +.LSEH_info_mul_by_1_plus_i_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0_pty_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sgn0_pty_mod_384_body: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sgn0_pty_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0_pty_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sgn0_pty_mod_384x_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00 +.LSEH_info_sgn0_pty_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/add_mod_384x384-x86_64.s b/crypto/blst_src/build/coff/add_mod_384x384-x86_64.s new file mode 100644 index 00000000000..79976cc0e7a --- /dev/null +++ b/crypto/blst_src/build/coff/add_mod_384x384-x86_64.s @@ -0,0 +1,326 @@ +.text + +.def __add_mod_384x384; .scl 3; .type 32; .endef +.p2align 5 +__add_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + addq 0(%rdx),%r8 + movq 56(%rsi),%r15 + adcq 8(%rdx),%r9 + movq 64(%rsi),%rax + adcq 16(%rdx),%r10 + movq 72(%rsi),%rbx + adcq 24(%rdx),%r11 + movq 80(%rsi),%rbp + adcq 32(%rdx),%r12 + movq 88(%rsi),%rsi + adcq 40(%rdx),%r13 + movq %r8,0(%rdi) + adcq 48(%rdx),%r14 + movq %r9,8(%rdi) + adcq 56(%rdx),%r15 + movq %r10,16(%rdi) + adcq 64(%rdx),%rax + movq %r12,32(%rdi) + movq %r14,%r8 + adcq 72(%rdx),%rbx + movq %r11,24(%rdi) + movq %r15,%r9 + adcq 80(%rdx),%rbp + movq %r13,40(%rdi) + movq %rax,%r10 + adcq 88(%rdx),%rsi + movq %rbx,%r11 + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %rbp,%r12 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%rbx + sbbq 32(%rcx),%rbp + movq %rsi,%r13 + sbbq 40(%rcx),%rsi + sbbq $0,%rdx + + cmovcq %r8,%r14 + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %r14,48(%rdi) + cmovcq %r11,%rbx + movq %r15,56(%rdi) + cmovcq %r12,%rbp + movq %rax,64(%rdi) + cmovcq %r13,%rsi + movq 
%rbx,72(%rdi) + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 + + +.def __sub_mod_384x384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 + + +.globl add_mod_384x384 + +.def add_mod_384x384; .scl 2; .type 32; .endef +.p2align 5 +add_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_add_mod_384x384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_add_mod_384x384: + + + call __add_mod_384x384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_add_mod_384x384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_add_mod_384x384: + +.globl sub_mod_384x384 + +.def sub_mod_384x384; .scl 2; .type 32; .endef +.p2align 5 +sub_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sub_mod_384x384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sub_mod_384x384: + + + call __sub_mod_384x384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sub_mod_384x384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sub_mod_384x384: +.section .pdata +.p2align 2 +.rva .LSEH_begin_add_mod_384x384 +.rva .LSEH_body_add_mod_384x384 +.rva .LSEH_info_add_mod_384x384_prologue + +.rva .LSEH_body_add_mod_384x384 +.rva .LSEH_epilogue_add_mod_384x384 +.rva .LSEH_info_add_mod_384x384_body + +.rva .LSEH_epilogue_add_mod_384x384 +.rva .LSEH_end_add_mod_384x384 +.rva .LSEH_info_add_mod_384x384_epilogue + +.rva .LSEH_begin_sub_mod_384x384 +.rva .LSEH_body_sub_mod_384x384 +.rva .LSEH_info_sub_mod_384x384_prologue + +.rva .LSEH_body_sub_mod_384x384 +.rva .LSEH_epilogue_sub_mod_384x384 +.rva .LSEH_info_sub_mod_384x384_body + +.rva .LSEH_epilogue_sub_mod_384x384 +.rva .LSEH_end_sub_mod_384x384 +.rva .LSEH_info_sub_mod_384x384_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_add_mod_384x384_prologue: 
+.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_add_mod_384x384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_add_mod_384x384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sub_mod_384x384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sub_mod_384x384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sub_mod_384x384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/ct_inverse_mod_256-armv8.S b/crypto/blst_src/build/coff/ct_inverse_mod_256-armv8.S new file mode 100644 index 00000000000..17c3d25278f --- /dev/null +++ b/crypto/blst_src/build/coff/ct_inverse_mod_256-armv8.S @@ -0,0 +1,798 @@ +.text + +.globl ct_inverse_mod_256 +.def ct_inverse_mod_256; +.type 32; +.endef +.p2align 5 +ct_inverse_mod_256: +.long 3573752639 + stp x29, x30, [sp,#-80]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + sub sp, sp, #1040 + + ldp x4, x5, [x1,#8*0] + ldp x6, x7, [x1,#8*2] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ str x0, [sp] + + ldp x8, x9, [x2,#8*0] + ldp x10, x11, [x2,#8*2] + + stp x4, x5, [x1,#8*0] // copy input to |a| + stp x6, x7, [x1,#8*2] + stp x8, x9, [x1,#8*4] // copy modulus to |b| + stp x10, x11, [x1,#8*6] + + ////////////////////////////////////////// first iteration + bl .Lab_approximation_31_256_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + str x12,[x0,#8*8] // initialize |u| with |f0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to dst |b| + bl __smul_256_n_shift_by_31 + str x12, [x0,#8*9] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + ldr x8, [x1,#8*8] // |u| + ldr x9, [x1,#8*13] // |v| + madd x4, x16, x8, xzr // |u|*|f0| + madd x4, x17, x9, x4 // |v|*|g0| + str x4, [x0,#8*4] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*5] + stp x5, x5, [x0,#8*7] + + madd x4, x12, x8, xzr // |u|*|f1| + madd x4, x13, x9, x4 // |v|*|g1| + str x4, [x0,#8*9] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*10] + stp x5, x5, [x0,#8*12] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst 
|a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc 
x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + ////////////////////////////////////////// two[!] 
last iterations + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #47 // 31 + 512 % 31 + //bl __ab_approximation_62_256 // |a| and |b| are exact, + ldr x7, [x1,#8*0] // just load + ldr x11, [x1,#8*4] + bl __inner_loop_62_256 + + mov x16, x14 + mov x17, x15 + ldr x0, [sp] // original out_ptr + bl __smul_256x63 + bl __smul_512x63_tail + ldr x30, [x29,#8] + + smulh x20, x7, x17 // figure out top-most limb + ldp x8, x9, [x3,#8*0] + adc x23, x23, x25 + ldp x10, x11, [x3,#8*2] + + add x20, x20, x23 // x20 is 1, 0 or -1 + asr x19, x20, #63 // sign as mask + + and x23, x8, x19 // add mod<<256 conditionally + and x24, x9, x19 + adds x4, x4, x23 + and x25, x10, x19 + adcs x5, x5, x24 + and x26, x11, x19 + adcs x6, x6, x25 + adcs x7, x22, x26 + adc x20, x20, xzr // x20 is 1, 0 or -1 + + neg x19, x20 + orr x20, x20, x19 // excess bit or sign as mask + asr x19, x19, #63 // excess bit as mask + + and x8, x8, x20 // mask |mod| + and x9, x9, x20 + and x10, x10, x20 + and x11, x11, x20 + + eor x8, x8, x19 // conditionally negate |mod| + eor x9, x9, x19 + adds x8, x8, x19, lsr#63 + eor x10, x10, x19 + adcs x9, x9, xzr + eor x11, x11, x19 + adcs x10, x10, xzr + adc x11, x11, xzr + + adds x4, x4, x8 // final adjustment for |mod|<<256 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*4] + adc x7, x7, x11 + stp x6, x7, [x0,#8*6] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldr x29, [sp],#80 +.long 3573752767 + ret + + +//////////////////////////////////////////////////////////////////////// +.def __smul_256x63; +.type 32; +.endef +.p2align 5 +__smul_256x63: + ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|) + asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x6, x7, [x1,#8*2+64] + eor x16, x16, x14 // conditionally negate |f_| (or |g_|) + ldr x22, [x1,#8*4+64] + + eor x4, x4, x14 // conditionally negate |u| (or |v|) + sub x16, x16, x14 + eor x5, x5, x14 + adds x4, x4, x14, lsr#63 + eor x6, x6, x14 + adcs x5, x5, xzr + eor x7, x7, x14 + adcs x6, x6, xzr + eor x22, x22, x14 + umulh x19, x4, x16 + adcs x7, x7, xzr + umulh x20, x5, x16 + adcs x22, x22, xzr + umulh x21, x6, x16 + mul x4, x4, x16 + cmp x16, #0 + mul x5, x5, x16 + csel x22, x22, xzr, ne + mul x6, x6, x16 + adds x5, x5, x19 + mul x24, x7, x16 + adcs x6, x6, x20 + adcs x24, x24, x21 + adc x26, xzr, xzr + ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|) + asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x10, x11, [x1,#8*2+104] + eor x17, x17, x14 // conditionally negate |f_| (or |g_|) + ldr x23, [x1,#8*4+104] + + eor x8, x8, x14 // conditionally negate |u| (or |v|) + sub x17, x17, x14 + eor x9, x9, x14 + adds x8, x8, x14, lsr#63 + eor x10, x10, x14 + adcs x9, x9, xzr + eor x11, x11, x14 + adcs x10, x10, xzr + eor x23, x23, x14 + umulh x19, x8, x17 + adcs x11, x11, xzr + umulh x20, x9, x17 + adcs x23, x23, xzr + umulh x21, x10, x17 + adc x15, xzr, xzr // used in __smul_512x63_tail + mul x8, x8, x17 + cmp x17, #0 + mul x9, x9, x17 + csel x23, x23, xzr, ne + mul x10, x10, x17 + adds x9, x9, x19 + mul x25, x11, x17 + adcs x10, x10, x20 + adcs x25, x25, x21 + adc x26, x26, xzr + + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*0] + adcs x24, x24, x25 + stp x6, x24, [x0,#8*2] + + ret + + +.def __smul_512x63_tail; +.type 32; +.endef +.p2align 5 +__smul_512x63_tail: + umulh x24, x7, x16 + ldp x5, x6, [x1,#8*18] // load rest of |v| + adc x26, x26, xzr + ldr x7, [x1,#8*20] + and x22, x22, x16 + + umulh x11, x11, x17 // resume 
|v|*|g1| chain + + sub x24, x24, x22 // tie up |u|*|f1| chain + asr x25, x24, #63 + + eor x5, x5, x14 // conditionally negate rest of |v| + eor x6, x6, x14 + adds x5, x5, x15 + eor x7, x7, x14 + adcs x6, x6, xzr + umulh x19, x23, x17 + adc x7, x7, xzr + umulh x20, x5, x17 + add x11, x11, x26 + umulh x21, x6, x17 + + mul x4, x23, x17 + mul x5, x5, x17 + adds x4, x4, x11 + mul x6, x6, x17 + adcs x5, x5, x19 + mul x22, x7, x17 + adcs x6, x6, x20 + adcs x22, x22, x21 + adc x23, xzr, xzr // used in the final step + + adds x4, x4, x24 + adcs x5, x5, x25 + adcs x6, x6, x25 + stp x4, x5, [x0,#8*4] + adcs x22, x22, x25 // carry is used in the final step + stp x6, x22, [x0,#8*6] + + ret + + +.def __smul_256_n_shift_by_31; +.type 32; +.endef +.p2align 5 +__smul_256_n_shift_by_31: + ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) + asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x6, x7, [x1,#8*2+0] + eor x25, x12, x24 // conditionally negate |f0| (or |g0|) + + eor x4, x4, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x5, x5, x24 + adds x4, x4, x24, lsr#63 + eor x6, x6, x24 + adcs x5, x5, xzr + eor x7, x7, x24 + umulh x19, x4, x25 + adcs x6, x6, xzr + umulh x20, x5, x25 + adc x7, x7, xzr + umulh x21, x6, x25 + and x24, x24, x25 + umulh x22, x7, x25 + neg x24, x24 + + mul x4, x4, x25 + mul x5, x5, x25 + mul x6, x6, x25 + adds x5, x5, x19 + mul x7, x7, x25 + adcs x6, x6, x20 + adcs x7, x7, x21 + adc x22, x22, x24 + ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) + asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x10, x11, [x1,#8*2+32] + eor x25, x13, x24 // conditionally negate |f0| (or |g0|) + + eor x8, x8, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x9, x9, x24 + adds x8, x8, x24, lsr#63 + eor x10, x10, x24 + adcs x9, x9, xzr + eor x11, x11, x24 + umulh x19, x8, x25 + adcs x10, x10, xzr + umulh x20, x9, x25 + adc x11, x11, xzr + umulh x21, x10, x25 + and x24, x24, x25 + umulh x23, x11, x25 + neg x24, x24 + + mul x8, x8, x25 + mul x9, x9, x25 + mul x10, x10, x25 + adds x9, x9, x19 + mul x11, x11, x25 + adcs x10, x10, x20 + adcs x11, x11, x21 + adc x23, x23, x24 + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x8, x22, x23 + + extr x4, x5, x4, #31 + extr x5, x6, x5, #31 + extr x6, x7, x6, #31 + asr x23, x8, #63 // result's sign as mask + extr x7, x8, x7, #31 + + eor x4, x4, x23 // ensure the result is positive + eor x5, x5, x23 + adds x4, x4, x23, lsr#63 + eor x6, x6, x23 + adcs x5, x5, xzr + eor x7, x7, x23 + adcs x6, x6, xzr + stp x4, x5, [x0,#8*0] + adc x7, x7, xzr + stp x6, x7, [x0,#8*2] + + eor x12, x12, x23 // adjust |f/g| accordingly + eor x13, x13, x23 + sub x12, x12, x23 + sub x13, x13, x23 + + ret + +.def __ab_approximation_31_256; +.type 32; +.endef +.p2align 4 +__ab_approximation_31_256: + ldp x6, x7, [x1,#8*2] + ldp x10, x11, [x1,#8*6] + ldp x4, x5, [x1,#8*0] + ldp x8, x9, [x1,#8*4] + +.Lab_approximation_31_256_loaded: + orr x19, x7, x11 // check top-most limbs, ... + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x5, ne + orr x19, x7, x11 // and ones before top-most, ... + csel x10, x10, x9, ne + + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x4, ne + orr x19, x7, x11 // and one more, ... 
+ csel x10, x10, x8, ne + + clz x19, x19 + cmp x19, #64 + csel x19, x19, xzr, ne + csel x7, x7, x6, ne + csel x11, x11, x10, ne + neg x20, x19 + + lslv x7, x7, x19 // align high limbs to the left + lslv x11, x11, x19 + lsrv x6, x6, x20 + lsrv x10, x10, x20 + and x6, x6, x20, asr#6 + and x10, x10, x20, asr#6 + orr x7, x7, x6 + orr x11, x11, x10 + + bfxil x7, x4, #0, #31 + bfxil x11, x8, #0, #31 + + b __inner_loop_31_256 + ret + + +.def __inner_loop_31_256; +.type 32; +.endef +.p2align 4 +__inner_loop_31_256: + mov x2, #31 + mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x23,#0x7FFFFFFF7FFFFFFF + +.Loop_31_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x15 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x15, x15, x13, hs // exchange |fg0| and |fg1| + csel x13, x13, x19, hs + lsr x7, x7, #1 + and x19, x15, x22 + and x20, x23, x22 + sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x15, x15, x15 // |f1|<<=1 + add x13, x13, x20 + sub x15, x15, x23 + cbnz x2, .Loop_31_256 + + mov x23, #0x7FFFFFFF + ubfx x12, x13, #0, #32 + ubfx x13, x13, #32, #32 + ubfx x14, x15, #0, #32 + ubfx x15, x15, #32, #32 + sub x12, x12, x23 // remove bias + sub x13, x13, x23 + sub x14, x14, x23 + sub x15, x15, x23 + + ret + + +.def __inner_loop_62_256; +.type 32; +.endef +.p2align 4 +__inner_loop_62_256: + mov x12, #1 // |f0|=1 + mov x13, #0 // |g0|=0 + mov x14, #0 // |f1|=0 + mov x15, #1 // |g1|=1 + +.Loop_62_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x12 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + mov x20, x13 + csel x12, x12, x14, hs // exchange |f0| and |f1| + csel x14, x14, x19, hs + csel x13, x13, x15, hs // exchange |g0| and |g1| + csel x15, x15, x20, hs + lsr x7, x7, #1 + and x19, x14, x22 + and x20, x15, x22 + add x14, x14, x14 // |f1|<<=1 + add x15, x15, x15 // |g1|<<=1 + sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) 
+ cbnz x2, .Loop_62_256 + + ret + diff --git a/crypto/blst_src/build/coff/ct_inverse_mod_256-x86_64.s b/crypto/blst_src/build/coff/ct_inverse_mod_256-x86_64.s new file mode 100644 index 00000000000..e7d4a6313b1 --- /dev/null +++ b/crypto/blst_src/build/coff/ct_inverse_mod_256-x86_64.s @@ -0,0 +1,1209 @@ +.text + +.globl ct_inverse_mod_256 +.def ct_inverse_mod_256; .scl 2; .type 32; .endef +.p2align 5 +ct_inverse_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_ct_inverse_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $1072,%rsp + +.LSEH_body_ct_inverse_mod_256: + + + leaq 48+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + movq 0(%rdx),%r12 + movq 8(%rdx),%r13 + movq 16(%rdx),%r14 + movq 24(%rdx),%r15 + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + + movq %r12,32(%rax) + movq %r13,40(%rax) + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rax,%rsi + + + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,64(%rdi) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,72(%rdi) + + + xorq $256,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + + movq 64(%rsi),%r8 + movq 104(%rsi),%r12 + movq %r8,%r9 + imulq 0(%rsp),%r8 + movq %r12,%r13 + imulq 8(%rsp),%r12 + addq %r12,%r8 + movq %r8,32(%rdi) + sarq $63,%r8 + movq %r8,40(%rdi) + movq %r8,48(%rdi) + movq %r8,56(%rdi) + movq %r8,64(%rdi) + leaq 64(%rsi),%rsi + + imulq %rdx,%r9 + imulq %rcx,%r13 + addq %r13,%r9 + movq %r9,72(%rdi) + sarq $63,%r9 + movq %r9,80(%rdi) + movq %r9,88(%rdi) + movq %r9,96(%rdi) + movq %r9,104(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call 
__smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + sarq $63,%rbp + movq %rbp,40(%rdi) + movq %rbp,48(%rdi) + movq %rbp,56(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + 
call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + + xorq $256+64,%rsi + movl $47,%edx + + movq 0(%rsi),%r8 + + movq 32(%rsi),%r10 + + call __inner_loop_62_256 + + + + + + + + leaq 64(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_512x63 + adcq %rbp,%rdx + + movq 40(%rsp),%rsi + movq %rdx,%rax + sarq $63,%rdx + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %rdx,%r15 + adcq $0,%rax + + movq %rax,%rdx + negq %rax + orq %rax,%rdx + sarq $63,%rax + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + xorq %rax,%r8 + xorq %rcx,%rcx + xorq %rax,%r9 + subq %rax,%rcx + xorq %rax,%r10 + xorq %rax,%rdx + addq %rcx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %rdx,%r15 + + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 1072(%rsp),%r8 + movq 
0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_ct_inverse_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_ct_inverse_mod_256: +.def __smulq_512x63; .scl 3; .type 32; .endef +.p2align 5 +__smulq_512x63: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %r9,8(%rdi) + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %r10,16(%rdi) + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %r11,24(%rdi) + + movq 40(%rsi),%r8 + movq 48(%rsi),%r9 + movq 56(%rsi),%r10 + movq 64(%rsi),%r11 + movq 72(%rsi),%r12 + movq 80(%rsi),%r13 + movq 88(%rsi),%r14 + movq 96(%rsi),%r15 + + movq %rcx,%rdx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rcx + addq %rax,%rcx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rcx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rcx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rcx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rcx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rcx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rcx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rcx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + imulq %rcx + addq %rax,%r15 + adcq $0,%rdx + + movq %rbp,%rbx + sarq $63,%rbp + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq %rbx,%r12 + adcq %rbp,%r13 + adcq %rbp,%r14 + adcq %rbp,%r15 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + .byte 0xf3,0xc3 + + +.def __smulq_256x63; .scl 3; .type 32; .endef +.p2align 5 +__smulq_256x63: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %rcx,%rdx + movq 40+0(%rsi),%r12 + movq 40+8(%rsi),%r13 + movq 40+16(%rsi),%r14 + movq 40+24(%rsi),%r15 + movq 40+32(%rsi),%rcx + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq 
%rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rcx + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rcx + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + andq %rbx,%rcx + negq %rcx + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %rbp,32(%rdi) + + .byte 0xf3,0xc3 + +.def __smulq_256_n_shift_by_31; .scl 3; .type 32; .endef +.p2align 5 +__smulq_256_n_shift_by_31: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,0(%rdi) + movq %rcx,8(%rdi) + movq %rdx,%rbp + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + + movq %rbp,%rbx + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rbx + addq %rax,%rbx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + andq %rbx,%rbp + negq %rbp + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r14 + movq 32+24(%rsi),%r15 + + movq %rcx,%rbx + sarq $63,%rcx + xorq %rax,%rax + subq %rcx,%rax + + xorq %rcx,%rbx + addq %rax,%rbx + + xorq %rcx,%r12 + xorq %rcx,%r13 + xorq %rcx,%r14 + xorq %rcx,%r15 + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + andq %rbx,%rcx + negq %rcx + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq 0(%rdi),%rdx + movq 8(%rdi),%rcx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%rbp,%r11 + + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + xorq %rbp,%rdx + xorq %rbp,%rcx + addq %rax,%rdx + addq %rax,%rcx + + .byte 0xf3,0xc3 + +.def __ab_approximation_31_256; .scl 3; .type 32; .endef +.p2align 5 +__ab_approximation_31_256: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 24(%rsi),%r9 + movq 56(%rsi),%r11 + movq 16(%rsi),%rbx + movq 48(%rsi),%rbp + movq 8(%rsi),%r8 + movq 40(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 32(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + notq %rax + andq %rax,%r9 + andq %rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp 
__inner_loop_31_256 + + .byte 0xf3,0xc3 + +.def __inner_loop_31_256; .scl 3; .type 32; .endef +.p2align 5 +__inner_loop_31_256: + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +.Loop_31_256: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edx + jnz .Loop_31_256 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 + + +.def __inner_loop_62_256; .scl 3; .type 32; .endef +.p2align 5 +__inner_loop_62_256: + .byte 0xf3,0x0f,0x1e,0xfa + + movl %edx,%r15d + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq %rdx,%r13 + movq %rdx,%r14 + +.Loop_62_256: + xorq %rax,%rax + testq %r14,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq %r14,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%r15d + jnz .Loop_62_256 + + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_ct_inverse_mod_256 +.rva .LSEH_body_ct_inverse_mod_256 +.rva .LSEH_info_ct_inverse_mod_256_prologue + +.rva .LSEH_body_ct_inverse_mod_256 +.rva .LSEH_epilogue_ct_inverse_mod_256 +.rva .LSEH_info_ct_inverse_mod_256_body + +.rva .LSEH_epilogue_ct_inverse_mod_256 +.rva .LSEH_end_ct_inverse_mod_256 +.rva .LSEH_info_ct_inverse_mod_256_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_ct_inverse_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_ct_inverse_mod_256_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x86,0x00 +.byte 0x00,0xe4,0x87,0x00 +.byte 0x00,0xd4,0x88,0x00 +.byte 0x00,0xc4,0x89,0x00 +.byte 0x00,0x34,0x8a,0x00 +.byte 0x00,0x54,0x8b,0x00 +.byte 0x00,0x74,0x8d,0x00 +.byte 0x00,0x64,0x8e,0x00 +.byte 0x00,0x01,0x8c,0x00 +.LSEH_info_ct_inverse_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/ct_inverse_mod_384-armv8.S b/crypto/blst_src/build/coff/ct_inverse_mod_384-armv8.S new file mode 100644 index 00000000000..65193f1e96a --- /dev/null +++ b/crypto/blst_src/build/coff/ct_inverse_mod_384-armv8.S @@ -0,0 +1,729 @@ +.text + +.globl ct_inverse_mod_383 +.def ct_inverse_mod_383; +.type 32; +.endef +.p2align 5 +ct_inverse_mod_383: +.long 3573752639 + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #1040 + + ldp x22, x4, [x1,#8*0] + ldp x5, x6, [x1,#8*2] + ldp x7, x8, [x1,#8*4] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ stp x0, x3, [sp] + + ldp x9, x10, [x2,#8*0] + ldp x11, x12, [x2,#8*2] + ldp x13, x14, [x2,#8*4] + + stp x22, x4, [x1,#8*0] // copy input to |a| + stp x5, x6, [x1,#8*2] + stp x7, x8, [x1,#8*4] + stp x9, x10, [x1,#8*6] // copy modulus to |b| + stp x11, x12, [x1,#8*8] + stp x13, x14, [x1,#8*10] + + ////////////////////////////////////////// first iteration + mov x2, #62 + bl .Lab_approximation_62_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + str x15,[x0,#8*12] // initialize |u| with |f0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to dst |b| + bl __smul_383_n_shift_by_62 + str x15, [x0,#8*12] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + ldr x7, [x1,#8*12] // |u| + ldr x8, [x1,#8*18] // |v| + mul x3, x20, x7 // |u|*|f0| + smulh x4, x20, x7 + mul x5, x21, x8 // |v|*|g0| + smulh x6, x21, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*6] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*8] + stp x5, x5, [x0,#8*10] + + mul x3, x15, x7 // |u|*|f1| + smulh x4, x15, x7 + mul x5, x16, x8 // |v|*|g1| + smulh x6, x16, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*12] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*14] + stp x5, x5, [x0,#8*16] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 
// corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + asr x27, x27, #63 // sign extension + stp x27, x27, [x0,#8*6] + stp x27, x27, [x0,#8*8] + stp x27, x27, [x0,#8*10] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + ////////////////////////////////////////// iteration before last + eor x1, x1, #256 // flip-flop src 
|a|b|u|v| + mov x2, #62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldp x3, x8, [x1,#8*0] // just load + ldp x9, x14, [x1,#8*6] + bl __inner_loop_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + str x3, [x0,#8*0] + str x9, [x0,#8*6] + + mov x20, x15 // exact |f0| + mov x21, x16 // exact |g0| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*12 // pointer to dst |u| + bl __smul_383x63 + + mov x20, x15 // exact |f1| + mov x21, x16 // exact |g1| + add x0, x0, #8*6 // pointer to dst |v| + bl __smul_383x63 + bl __smul_767x63_tail + + ////////////////////////////////////////// last iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #22 // 766 % 62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldr x3, [x1,#8*0] // just load + eor x8, x8, x8 + ldr x9, [x1,#8*6] + eor x14, x14, x14 + bl __inner_loop_62 + + mov x20, x17 + mov x21, x19 + ldp x0, x15, [sp] // original out_ptr and n_ptr + bl __smul_383x63 + bl __smul_767x63_tail + ldr x30, [x29,#8] + + asr x22, x8, #63 // sign as mask + ldp x9, x10, [x15,#8*0] + ldp x11, x12, [x15,#8*2] + ldp x13, x14, [x15,#8*4] + + and x9, x9, x22 // add mod<<384 conditionally + and x10, x10, x22 + adds x3, x3, x9 + and x11, x11, x22 + adcs x4, x4, x10 + and x12, x12, x22 + adcs x5, x5, x11 + and x13, x13, x22 + adcs x6, x6, x12 + and x14, x14, x22 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*8] + adc x8, x8, x14 + stp x7, x8, [x0,#8*10] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 +.long 3573752767 + ret + + +//////////////////////////////////////////////////////////////////////// +// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
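+// A brief summary of the helper below (inferred from the code itself, since
+// the detailed commentary lives in ctx_inverse_mod_384-x86_64): __smul_383x63
+// computes |u|*|f_| + |v|*|g_|, where |u| and |v| are the 6-limb (383-bit)
+// values at offsets 8*12 and 8*18 of the work area, and |f_|/|g_| are the
+// signed 63-bit factors passed in x20/x21 (conditionally negated together
+// with their operands). The low 6 limbs of the sum are stored at |x0|, and
+// the top limbs are carried over into __smul_767x63_tail.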
+.def __smul_383x63; +.type 32; +.endef +.p2align 5 +__smul_383x63: + ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) + asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x5, x6, [x1,#8*2+96] + eor x20, x20, x17 // conditionally negate |f_| (or |g_|) + ldp x7, x8, [x1,#8*4+96] + + eor x3, x3, x17 // conditionally negate |u| (or |v|) + sub x20, x20, x17 + eor x4, x4, x17 + adds x3, x3, x17, lsr#63 + eor x5, x5, x17 + adcs x4, x4, xzr + eor x6, x6, x17 + adcs x5, x5, xzr + eor x7, x7, x17 + adcs x6, x6, xzr + umulh x22, x3, x20 + eor x8, x8, x17 + umulh x23, x4, x20 + adcs x7, x7, xzr + umulh x24, x5, x20 + adcs x8, x8, xzr + umulh x25, x6, x20 + umulh x26, x7, x20 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x22 + mul x6, x6, x20 + adcs x5, x5, x23 + mul x7, x7, x20 + adcs x6, x6, x24 + mul x27,x8, x20 + adcs x7, x7, x25 + adcs x27,x27,x26 + adc x2, xzr, xzr + ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|) + asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x11, x12, [x1,#8*2+144] + eor x21, x21, x17 // conditionally negate |f_| (or |g_|) + ldp x13, x14, [x1,#8*4+144] + + eor x9, x9, x17 // conditionally negate |u| (or |v|) + sub x21, x21, x17 + eor x10, x10, x17 + adds x9, x9, x17, lsr#63 + eor x11, x11, x17 + adcs x10, x10, xzr + eor x12, x12, x17 + adcs x11, x11, xzr + eor x13, x13, x17 + adcs x12, x12, xzr + umulh x22, x9, x21 + eor x14, x14, x17 + umulh x23, x10, x21 + adcs x13, x13, xzr + umulh x24, x11, x21 + adcs x14, x14, xzr + umulh x25, x12, x21 + adc x19, xzr, xzr // used in __smul_767x63_tail + umulh x26, x13, x21 + mul x9, x9, x21 + mul x10, x10, x21 + mul x11, x11, x21 + adds x10, x10, x22 + mul x12, x12, x21 + adcs x11, x11, x23 + mul x13, x13, x21 + adcs x12, x12, x24 + mul x28,x14, x21 + adcs x13, x13, x25 + adcs x28,x28,x26 + adc x2, x2, xzr + + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + stp x3, x4, [x0,#8*0] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*2] + adcs x27, x27, x28 + stp x7, x27, [x0,#8*4] + adc x28, x2, xzr // used in __smul_767x63_tail + + ret + + +.def __smul_767x63_tail; +.type 32; +.endef +.p2align 5 +__smul_767x63_tail: + smulh x27, x8, x20 + ldp x3, x4, [x1,#8*24] // load rest of |v| + umulh x14,x14, x21 + ldp x5, x6, [x1,#8*26] + ldp x7, x8, [x1,#8*28] + + eor x3, x3, x17 // conditionally negate rest of |v| + eor x4, x4, x17 + eor x5, x5, x17 + adds x3, x3, x19 + eor x6, x6, x17 + adcs x4, x4, xzr + eor x7, x7, x17 + adcs x5, x5, xzr + eor x8, x8, x17 + adcs x6, x6, xzr + umulh x22, x3, x21 + adcs x7, x7, xzr + umulh x23, x4, x21 + adc x8, x8, xzr + + umulh x24, x5, x21 + add x14, x14, x28 + umulh x25, x6, x21 + asr x28, x27, #63 + umulh x26, x7, x21 + mul x3, x3, x21 + mul x4, x4, x21 + mul x5, x5, x21 + adds x3, x3, x14 + mul x6, x6, x21 + adcs x4, x4, x22 + mul x7, x7, x21 + adcs x5, x5, x23 + mul x8, x8, x21 + adcs x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, x26 + + adds x3, x3, x27 + adcs x4, x4, x28 + adcs x5, x5, x28 + adcs x6, x6, x28 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x28 + stp x5, x6, [x0,#8*8] + adc x8, x8, x28 + stp x7, x8, [x0,#8*10] + + ret + + +.def __smul_383_n_shift_by_62; +.type 32; +.endef +.p2align 5 +__smul_383_n_shift_by_62: + ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) + asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x5, x6, [x1,#8*2+0] + eor x2, x15, x28 // conditionally negate |f0| (or |g0|) + ldp x7, x8, [x1,#8*4+0] + + eor x3, x3, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, 
x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + umulh x22, x3, x2 + adcs x6, x6, xzr + umulh x23, x4, x2 + eor x8, x8, x28 + umulh x24, x5, x2 + adcs x7, x7, xzr + umulh x25, x6, x2 + adc x8, x8, xzr + + umulh x26, x7, x2 + smulh x27, x8, x2 + mul x3, x3, x2 + mul x4, x4, x2 + mul x5, x5, x2 + adds x4, x4, x22 + mul x6, x6, x2 + adcs x5, x5, x23 + mul x7, x7, x2 + adcs x6, x6, x24 + mul x8, x8, x2 + adcs x7, x7, x25 + adcs x8, x8 ,x26 + adc x27, x27, xzr + ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) + asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x11, x12, [x1,#8*2+48] + eor x2, x16, x28 // conditionally negate |f0| (or |g0|) + ldp x13, x14, [x1,#8*4+48] + + eor x9, x9, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x10, x10, x28 + adds x9, x9, x28, lsr#63 + eor x11, x11, x28 + adcs x10, x10, xzr + eor x12, x12, x28 + adcs x11, x11, xzr + eor x13, x13, x28 + umulh x22, x9, x2 + adcs x12, x12, xzr + umulh x23, x10, x2 + eor x14, x14, x28 + umulh x24, x11, x2 + adcs x13, x13, xzr + umulh x25, x12, x2 + adc x14, x14, xzr + + umulh x26, x13, x2 + smulh x28, x14, x2 + mul x9, x9, x2 + mul x10, x10, x2 + mul x11, x11, x2 + adds x10, x10, x22 + mul x12, x12, x2 + adcs x11, x11, x23 + mul x13, x13, x2 + adcs x12, x12, x24 + mul x14, x14, x2 + adcs x13, x13, x25 + adcs x14, x14 ,x26 + adc x28, x28, xzr + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x27, x28 + + extr x3, x4, x3, #62 + extr x4, x5, x4, #62 + extr x5, x6, x5, #62 + asr x28, x9, #63 + extr x6, x7, x6, #62 + extr x7, x8, x7, #62 + extr x8, x9, x8, #62 + + eor x3, x3, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + adcs x6, x6, xzr + eor x8, x8, x28 + stp x3, x4, [x0,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x0,#8*2] + adc x8, x8, xzr + stp x7, x8, [x0,#8*4] + + eor x15, x15, x28 + eor x16, x16, x28 + sub x15, x15, x28 + sub x16, x16, x28 + + ret + +.def __ab_approximation_62; +.type 32; +.endef +.p2align 4 +__ab_approximation_62: + ldp x7, x8, [x1,#8*4] + ldp x13, x14, [x1,#8*10] + ldp x5, x6, [x1,#8*2] + ldp x11, x12, [x1,#8*8] + +.Lab_approximation_62_loaded: + orr x22, x8, x14 // check top-most limbs, ... + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x22, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + ldp x3, x4, [x1,#8*0] + ldp x9, x10, [x1,#8*6] + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x22, x8, x14 // ... and ones before that ... 
+ csel x13, x13, x11, ne + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x22, x8, x14 + csel x13, x13, x10, ne + + clz x22, x22 + cmp x22, #64 + csel x22, x22, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x23, x22 + + lslv x8, x8, x22 // align high limbs to the left + lslv x14, x14, x22 + lsrv x7, x7, x23 + lsrv x13, x13, x23 + and x7, x7, x23, asr#6 + and x13, x13, x23, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + b __inner_loop_62 + ret + +.def __inner_loop_62; +.type 32; +.endef +.p2align 4 +__inner_loop_62: + mov x15, #1 // |f0|=1 + mov x16, #0 // |g0|=0 + mov x17, #0 // |f1|=0 + mov x19, #1 // |g1|=1 + +.Loop_62: + sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + subs x24, x9, x3 // |b_|-|a_| + and x22, x9, x28 + sbc x25, x14, x8 + and x23, x14, x28 + subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x22, x15 + sbcs x27, x8, x23 + mov x23, x16 + csel x9, x9, x3, hs // |b_| = |a_| + csel x14, x14, x8, hs + csel x3, x26, x24, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x8, x27, x25, hs + csel x15, x15, x17, hs // exchange |f0| and |f1| + csel x17, x17, x22, hs + csel x16, x16, x19, hs // exchange |g0| and |g1| + csel x19, x19, x23, hs + extr x3, x8, x3, #1 + lsr x8, x8, #1 + and x22, x17, x28 + and x23, x19, x28 + add x17, x17, x17 // |f1|<<=1 + add x19, x19, x19 // |g1|<<=1 + sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...) + cbnz x2, .Loop_62 + + ret + diff --git a/crypto/blst_src/build/coff/ct_is_square_mod_384-armv8.S b/crypto/blst_src/build/coff/ct_is_square_mod_384-armv8.S new file mode 100644 index 00000000000..34336ff486b --- /dev/null +++ b/crypto/blst_src/build/coff/ct_is_square_mod_384-armv8.S @@ -0,0 +1,334 @@ +.text + +.globl ct_is_square_mod_384 +.def ct_is_square_mod_384; +.type 32; +.endef +.p2align 5 +ct_is_square_mod_384: +.long 3573752639 + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #512 + + ldp x3, x4, [x0,#8*0] // load input + ldp x5, x6, [x0,#8*2] + ldp x7, x8, [x0,#8*4] + + add x0, sp, #255 // find closest 256-byte-aligned spot + and x0, x0, #-256 // in the frame... 
+ + ldp x9, x10, [x1,#8*0] // load modulus + ldp x11, x12, [x1,#8*2] + ldp x13, x14, [x1,#8*4] + + stp x3, x4, [x0,#8*6] // copy input to |a| + stp x5, x6, [x0,#8*8] + stp x7, x8, [x0,#8*10] + stp x9, x10, [x0,#8*0] // copy modulus to |b| + stp x11, x12, [x0,#8*2] + stp x13, x14, [x0,#8*4] + + eor x2, x2, x2 // init the .Legendre symbol + mov x15, #24 // 24 is 768/30-1 + b .Loop_is_square + +.p2align 4 +.Loop_is_square: + bl __ab_approximation_30 + sub x15, x15, #1 + + eor x1, x0, #128 // pointer to dst |b| + bl __smul_384_n_shift_by_30 + + mov x19, x16 // |f0| + mov x20, x17 // |g0| + add x1, x1, #8*6 // pointer to dst |a| + bl __smul_384_n_shift_by_30 + + ldp x9, x10, [x1,#-8*6] + eor x0, x0, #128 // flip-flop src |a|b| + and x27, x27, x9 // if |a| was negative, + add x2, x2, x27, lsr#1 // adjust |L| + + cbnz x15, .Loop_is_square + + ////////////////////////////////////////// last iteration + //bl __ab_approximation_30 // |a| and |b| are exact, + //ldr x8, [x0,#8*6] // and loaded + //ldr x14, [x0,#8*0] + mov x15, #48 // 48 is 768%30 + 30 + bl __inner_loop_48 + ldr x30, [x29,#8] + + and x0, x2, #1 + eor x0, x0, #1 + + add sp, sp, #512 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 +.long 3573752767 + ret + + +.def __smul_384_n_shift_by_30; +.type 32; +.endef +.p2align 5 +__smul_384_n_shift_by_30: + ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|) + asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x5, x6, [x0,#8*2+0] + eor x20, x20, x27 // conditionally negate |g1| (or |f1|) + ldp x7, x8, [x0,#8*4+0] + + eor x3, x3, x27 // conditionally negate |b| (or |a|) + sub x20, x20, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + umulh x21, x3, x20 + adcs x6, x6, xzr + umulh x22, x4, x20 + eor x8, x8, x27 + umulh x23, x5, x20 + adcs x7, x7, xzr + umulh x24, x6, x20 + adc x8, x8, xzr + + umulh x25, x7, x20 + and x28, x20, x27 + umulh x26, x8, x20 + neg x28, x28 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x21 + mul x6, x6, x20 + adcs x5, x5, x22 + mul x7, x7, x20 + adcs x6, x6, x23 + mul x8, x8, x20 + adcs x7, x7, x24 + adcs x8, x8 ,x25 + adc x26, x26, x28 + ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|) + asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x11, x12, [x0,#8*2+48] + eor x19, x19, x27 // conditionally negate |g1| (or |f1|) + ldp x13, x14, [x0,#8*4+48] + + eor x9, x9, x27 // conditionally negate |b| (or |a|) + sub x19, x19, x27 + eor x10, x10, x27 + adds x9, x9, x27, lsr#63 + eor x11, x11, x27 + adcs x10, x10, xzr + eor x12, x12, x27 + adcs x11, x11, xzr + eor x13, x13, x27 + umulh x21, x9, x19 + adcs x12, x12, xzr + umulh x22, x10, x19 + eor x14, x14, x27 + umulh x23, x11, x19 + adcs x13, x13, xzr + umulh x24, x12, x19 + adc x14, x14, xzr + + umulh x25, x13, x19 + and x28, x19, x27 + umulh x27, x14, x19 + neg x28, x28 + mul x9, x9, x19 + mul x10, x10, x19 + mul x11, x11, x19 + adds x10, x10, x21 + mul x12, x12, x19 + adcs x11, x11, x22 + mul x13, x13, x19 + adcs x12, x12, x23 + mul x14, x14, x19 + adcs x13, x13, x24 + adcs x14, x14 ,x25 + adc x27, x27, x28 + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x26, x27 + + extr x3, x4, x3, #30 + extr x4, x5, x4, #30 + extr x5, x6, x5, #30 + asr x27, x9, #63 + extr x6, x7, x6, #30 + extr x7, x8, x7, #30 + extr x8, x9, x8, #30 + + eor x3, 
x3, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + adcs x6, x6, xzr + eor x8, x8, x27 + stp x3, x4, [x1,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x1,#8*2] + adc x8, x8, xzr + stp x7, x8, [x1,#8*4] + + ret + +.def __ab_approximation_30; +.type 32; +.endef +.p2align 4 +__ab_approximation_30: + ldp x13, x14, [x0,#8*4] // |a| is still in registers + ldp x11, x12, [x0,#8*2] + + orr x21, x8, x14 // check top-most limbs, ... + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x21, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x21, x8, x14 // ... and ones before that ... + csel x13, x13, x11, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x21, x8, x14 // and one more, ... + csel x13, x13, x10, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x3, ne + orr x21, x8, x14 + csel x13, x13, x9, ne + + clz x21, x21 + cmp x21, #64 + csel x21, x21, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x22, x21 + + lslv x8, x8, x21 // align high limbs to the left + lslv x14, x14, x21 + lsrv x7, x7, x22 + lsrv x13, x13, x22 + and x7, x7, x22, asr#6 + and x13, x13, x22, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + bfxil x8, x3, #0, #32 + bfxil x14, x9, #0, #32 + + b __inner_loop_30 + ret + + +.def __inner_loop_30; +.type 32; +.endef +.p2align 4 +__inner_loop_30: + mov x28, #30 + mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x27,#0x7FFFFFFF7FFFFFFF + +.Loop_30: + sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x8, x14 + sub x28, x28, #1 + and x21, x14, x24 + + sub x22, x14, x8 // |b_|-|a_| + subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 + mov x21, x20 + csel x14, x14, x8, hs // |b_| = |a_| + csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x20, x20, x17, hs // exchange |fg0| and |fg1| + csel x17, x17, x21, hs + csel x2, x2, x25, hs + lsr x8, x8, #1 + and x21, x20, x24 + and x22, x27, x24 + add x23, x14, #2 + sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x20, x20, x20 // |f1|<<=1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + add x17, x17, x22 + sub x20, x20, x27 + + cbnz x28, .Loop_30 + + mov x27, #0x7FFFFFFF + ubfx x16, x17, #0, #32 + ubfx x17, x17, #32, #32 + ubfx x19, x20, #0, #32 + ubfx x20, x20, #32, #32 + sub x16, x16, x27 // remove the bias + sub x17, x17, x27 + sub x19, x19, x27 + sub x20, x20, x27 + + ret + +.def __inner_loop_48; +.type 32; +.endef +.p2align 4 +__inner_loop_48: +.Loop_48: + sbfx x24, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x3, x9 + sub x15, x15, #1 + and x21, x9, x24 + sub x22, x9, x3 // |b_|-|a_| + subs x23, x3, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 + csel x9, x9, x3, hs // |b_| = |a_| + csel x3, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x2, x2, x25, hs + add x23, x9, #2 + lsr x3, x3, #1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + + cbnz x15, .Loop_48 + + ret + diff --git a/crypto/blst_src/build/coff/ct_is_square_mod_384-x86_64.s b/crypto/blst_src/build/coff/ct_is_square_mod_384-x86_64.s new file mode 100644 index 00000000000..ee4790321e6 --- 
/dev/null +++ b/crypto/blst_src/build/coff/ct_is_square_mod_384-x86_64.s @@ -0,0 +1,505 @@ +.text + +.globl ct_is_square_mod_384 +.def ct_is_square_mod_384; .scl 2; .type 32; .endef +.p2align 5 +ct_is_square_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_ct_is_square_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $536,%rsp + +.LSEH_body_ct_is_square_mod_384: + + + leaq 24+255(%rsp),%rax + andq $-256,%rax + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbx + movq 24(%rsi),%rcx + movq 32(%rsi),%rdx + movq 40(%rsi),%rdi + movq %rax,%rsi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rcx,72(%rax) + movq %rdx,80(%rax) + movq %rdi,88(%rax) + + xorq %rbp,%rbp + movl $24,%ecx + jmp .Loop_is_square + +.p2align 5 +.Loop_is_square: + movl %ecx,16(%rsp) + + call __ab_approximation_30 + movq %rax,0(%rsp) + movq %rbx,8(%rsp) + + movq $128+48,%rdi + xorq %rsi,%rdi + call __smulq_384_n_shift_by_30 + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq -48(%rdi),%rdi + call __smulq_384_n_shift_by_30 + + movl 16(%rsp),%ecx + xorq $128,%rsi + + andq 48(%rdi),%r14 + shrq $1,%r14 + addq %r14,%rbp + + subl $1,%ecx + jnz .Loop_is_square + + + + + movq 48(%rsi),%r9 + call __inner_loop_48 + + movq $1,%rax + andq %rbp,%rax + xorq $1,%rax + + leaq 536(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_ct_is_square_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_ct_is_square_mod_384: + +.def __smulq_384_n_shift_by_30; .scl 3; .type 32; .endef +.p2align 5 +__smulq_384_n_shift_by_30: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r14 + andq %rbx,%r14 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r14 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r14 + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq 
$0,%r12 + adcq $0,%r13 + + movq %rdx,%r15 + andq %rbx,%r15 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r15 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r15 + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %r15,%r14 + + shrdq $30,%r9,%r8 + shrdq $30,%r10,%r9 + shrdq $30,%r11,%r10 + shrdq $30,%r12,%r11 + shrdq $30,%r13,%r12 + shrdq $30,%r14,%r13 + + sarq $63,%r14 + xorq %rbx,%rbx + subq %r14,%rbx + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + +.def __ab_approximation_30; .scl 3; .type 32; .endef +.p2align 5 +__ab_approximation_30: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 88(%rsi),%rbx + movq 80(%rsi),%r15 + movq 72(%rsi),%r14 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r11,%r12 + movq 64(%rsi),%r11 + cmovzq %r14,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r10,%r12 + movq 56(%rsi),%r10 + cmovzq %r11,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r9,%r12 + movq 48(%rsi),%r9 + cmovzq %r10,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r8,%r12 + cmovzq %r9,%r15 + + movq %r13,%rax + orq %rbx,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r13 + cmovzq %r9,%rbx + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%r12,%r13 + shldq %cl,%r15,%rbx + + movq $0xFFFFFFFF00000000,%rax + movl %r8d,%r8d + movl %r9d,%r9d + andq %rax,%r13 + andq %rax,%rbx + orq %r13,%r8 + orq %rbx,%r9 + + jmp __inner_loop_30 + + .byte 0xf3,0xc3 + +.def __inner_loop_30; .scl 3; .type 32; .endef +.p2align 5 +__inner_loop_30: + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rbx + movq $0x800000007FFFFFFF,%rcx + leaq -1(%rbx),%r15 + movl $30,%edi + +.Loop_30: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbx,%r12 + movq %rcx,%r13 + movq %rbp,%r14 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rcx,%rbx + cmovbq %r12,%rcx + cmovbq %rax,%rbp + + subq %r9,%r8 + subq %rcx,%rbx + addq %r15,%rbx + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbx + cmovzq %r13,%rcx + cmovzq %r14,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rcx,%rcx + leaq (%rax,%rbp,1),%rbp + subq %r15,%rcx + + subl $1,%edi + jnz .Loop_30 + + shrq $32,%r15 + movl %ebx,%eax + shrq $32,%rbx + movl %ecx,%edx + shrq $32,%rcx + subq %r15,%rax + subq %r15,%rbx + subq %r15,%rdx + subq %r15,%rcx + + .byte 0xf3,0xc3 + + +.def __inner_loop_48; .scl 3; .type 32; .endef +.p2align 5 +__inner_loop_48: + .byte 0xf3,0x0f,0x1e,0xfa + + movl $48,%edi + +.Loop_48: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbp,%r12 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rax,%rbp + + subq %r9,%r8 + + testq $1,%r10 
+ cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rax,%rbp + + subl $1,%edi + jnz .Loop_48 + + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_ct_is_square_mod_384 +.rva .LSEH_body_ct_is_square_mod_384 +.rva .LSEH_info_ct_is_square_mod_384_prologue + +.rva .LSEH_body_ct_is_square_mod_384 +.rva .LSEH_epilogue_ct_is_square_mod_384 +.rva .LSEH_info_ct_is_square_mod_384_body + +.rva .LSEH_epilogue_ct_is_square_mod_384 +.rva .LSEH_end_ct_is_square_mod_384 +.rva .LSEH_info_ct_is_square_mod_384_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_ct_is_square_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_ct_is_square_mod_384_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x43,0x00 +.byte 0x00,0xe4,0x44,0x00 +.byte 0x00,0xd4,0x45,0x00 +.byte 0x00,0xc4,0x46,0x00 +.byte 0x00,0x34,0x47,0x00 +.byte 0x00,0x54,0x48,0x00 +.byte 0x00,0x74,0x4a,0x00 +.byte 0x00,0x64,0x4b,0x00 +.byte 0x00,0x01,0x49,0x00 +.LSEH_info_ct_is_square_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/ctq_inverse_mod_384-x86_64.s b/crypto/blst_src/build/coff/ctq_inverse_mod_384-x86_64.s new file mode 100644 index 00000000000..42f058a3c8d --- /dev/null +++ b/crypto/blst_src/build/coff/ctq_inverse_mod_384-x86_64.s @@ -0,0 +1,1221 @@ +.text + +.globl ct_inverse_mod_383 +.def ct_inverse_mod_383; .scl 2; .type 32; .endef +.p2align 5 +ct_inverse_mod_383: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_ct_inverse_mod_383: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $1112,%rsp + +.LSEH_body_ct_inverse_mod_383: + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq 
%r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 
96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + xorq $256+96,%rsi + movl $62,%edi + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 48(%rsi),%r10 + movq 56(%rsi),%r11 + call __inner_loop_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + movq %r8,0(%rdi) + movq %r10,48(%rdi) + + + + leaq 96(%rsi),%rsi + leaq 96(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + + xorq $256+96,%rsi + movl $22,%edi + + movq 0(%rsi),%r8 + xorq %r9,%r9 + movq 48(%rsi),%r10 + xorq %r11,%r11 + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_ct_inverse_mod_383: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_ct_inverse_mod_383: +.def __smulq_767x63; .scl 3; .type 32; .endef +.p2align 5 +__smulq_767x63: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq 
$63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + movq %r9,8(%rdi) + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + movq %r10,16(%rdi) + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %r11,24(%rdi) + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + movq %r12,32(%rdi) + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + movq %r13,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + movq %rdx,%rsi + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rsi + addq %rax,%rsi + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rbx + xorq %rdx,%rbp + xorq %rdx,%rcx + xorq %rdx,%rdi + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulq %rsi + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rsi + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rsi + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rsi + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rsi + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rsi + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %rdx,%rbx + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + mulq %rsi + addq %rax,%rbp + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rcx + mulq %rsi + addq %rax,%rcx + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%rdi + movq 8(%rsp),%rdx + imulq %rsi,%rax + movq 16(%rsp),%rsi + addq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 + +.def __smulq_383x63; .scl 3; .type 32; .endef +.p2align 5 +__smulq_383x63: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + 
xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + +.def __smulq_383_n_shift_by_62; .scl 3; .type 32; .endef +.p2align 5 +__smulq_383_n_shift_by_62: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq 48(%rsi),%rsi + movq %rdx,%r14 + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq 
%r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $62,%r9,%r8 + shrdq $62,%r10,%r9 + shrdq $62,%r11,%r10 + shrdq $62,%r12,%r11 + shrdq $62,%r13,%r12 + shrdq $62,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 + +.def __ab_approximation_62; .scl 3; .type 32; .endef +.p2align 5 +__ab_approximation_62: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 16(%rsi),%r8 + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 8(%rsi),%r8 + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 0(%rsi),%r8 + movq 48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + jmp __inner_loop_62 + + .byte 0xf3,0xc3 + +.def __inner_loop_62; .scl 3; .type 32; .endef +.p2align 3 +.long 0 +__inner_loop_62: + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + movq %rsi,8(%rsp) + +.Loop_62: + xorq %rax,%rax + xorq %rbx,%rbx + testq $1,%r8 + movq %r10,%rbp + movq %r11,%r14 + cmovnzq %r10,%rax + cmovnzq %r11,%rbx + subq %r8,%rbp + sbbq %r9,%r14 + movq %r8,%r15 + movq %r9,%rsi + subq %rax,%r8 + sbbq %rbx,%r9 + cmovcq %rbp,%r8 + cmovcq %r14,%r9 + cmovcq %r15,%r10 + cmovcq %rsi,%r11 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrdq $1,%r9,%r8 + shrq $1,%r9 + testq $1,%r15 + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz .Loop_62 + + movq 8(%rsp),%rsi + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_ct_inverse_mod_383 +.rva .LSEH_body_ct_inverse_mod_383 +.rva .LSEH_info_ct_inverse_mod_383_prologue + +.rva .LSEH_body_ct_inverse_mod_383 +.rva .LSEH_epilogue_ct_inverse_mod_383 +.rva .LSEH_info_ct_inverse_mod_383_body + +.rva .LSEH_epilogue_ct_inverse_mod_383 +.rva .LSEH_end_ct_inverse_mod_383 +.rva .LSEH_info_ct_inverse_mod_383_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_ct_inverse_mod_383_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_ct_inverse_mod_383_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x8b,0x00 +.byte 0x00,0xe4,0x8c,0x00 +.byte 0x00,0xd4,0x8d,0x00 +.byte 0x00,0xc4,0x8e,0x00 +.byte 0x00,0x34,0x8f,0x00 +.byte 
0x00,0x54,0x90,0x00 +.byte 0x00,0x74,0x92,0x00 +.byte 0x00,0x64,0x93,0x00 +.byte 0x00,0x01,0x91,0x00 +.LSEH_info_ct_inverse_mod_383_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/ctx_inverse_mod_384-x86_64.s b/crypto/blst_src/build/coff/ctx_inverse_mod_384-x86_64.s new file mode 100644 index 00000000000..7c13e56eb2a --- /dev/null +++ b/crypto/blst_src/build/coff/ctx_inverse_mod_384-x86_64.s @@ -0,0 +1,1596 @@ +.text + +.globl ctx_inverse_mod_383 +.def ctx_inverse_mod_383; .scl 2; .type 32; .endef +.p2align 5 +ctx_inverse_mod_383: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_ctx_inverse_mod_383: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $1112,%rsp + +.LSEH_body_ctx_inverse_mod_383: + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call 
__smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + 
movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 
48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) 
+ movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + + xorq $256+96,%rsi + movl $53,%edi + + movq 0(%rsi),%r8 + + movq 48(%rsi),%r10 + + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulx_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_ctx_inverse_mod_383: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_ctx_inverse_mod_383: +.def __smulx_767x63; .scl 3; .type 32; .endef +.p2align 5 +__smulx_767x63: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + movq %rcx,%rax + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + sarq $63,%rax + xorq %rsi,%rsi + subq %rax,%rsi + + xorq %rax,%rdx + addq %rsi,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %rax,%r13 + xorq %rax,%r14 + xorq %rax,%r15 + xorq %rax,%rbx + xorq %rax,%rbp + xorq %rax,%rcx + xorq %rax,%rdi + addq %rsi,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulxq %r8,%r8,%rax + mulxq %r9,%r9,%rsi + addq %rax,%r9 + mulxq %r10,%r10,%rax + adcq %rsi,%r10 + mulxq %r11,%r11,%rsi + adcq %rax,%r11 + mulxq %r12,%r12,%rax + adcq %rsi,%r12 + mulxq %r13,%r13,%rsi + adcq %rax,%r13 + mulxq %r14,%r14,%rax + adcq %rsi,%r14 + mulxq %r15,%r15,%rsi + adcq 
%rax,%r15 + mulxq %rbx,%rbx,%rax + adcq %rsi,%rbx + mulxq %rbp,%rbp,%rsi + adcq %rax,%rbp + mulxq %rcx,%rcx,%rax + adcq %rsi,%rcx + mulxq %rdi,%rdi,%rsi + movq 8(%rsp),%rdx + movq 16(%rsp),%rsi + adcq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 + +.def __smulx_383x63; .scl 3; .type 32; .endef +.p2align 5 +__smulx_383x63: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + movq %rcx,%rdx + adcq %rbp,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + adcq %rbp,%r13 + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + +.def __smulx_383_n_shift_by_31; .scl 3; .type 32; .endef +.p2align 5 +__smulx_383_n_shift_by_31: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + xorq %r14,%r14 + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq %rdx,%r14 + + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq 
%rax,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%rax + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%r12,%r11 + shrdq $31,%rax,%r12 + shrdq $31,%r14,%rax + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 + +.def __smulx_191_n_shift_by_31; .scl 3; .type 32; .endef +.p2align 5 +__smulx_191_n_shift_by_31: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %r10,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r10 + addq %rbp,%r9 + adcq $0,%r10 + imulq %rdx + addq %rax,%r10 + adcq $0,%rdx + movq %rdx,%r14 + movq %rcx,%rdx + movq 48+0(%rsi),%r11 + movq 48+8(%rsi),%r12 + movq 48+16(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r11,%r11,%rbp + mulxq %r12,%r12,%r13 + addq %rbp,%r12 + adcq $0,%r13 + imulq %rdx + addq %rax,%r13 + adcq $0,%rdx + addq %r8,%r11 + adcq %r9,%r12 + adcq %r10,%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r12,%r11 + shrdq $31,%r13,%r12 + shrdq $31,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 + +.def __ab_approximation_31; .scl 3; .type 32; .endef +.p2align 5 +__ab_approximation_31: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 16(%rsi),%r8 + cmovzq %r10,%rbp + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 8(%rsi),%r8 + cmovzq %r10,%rbp + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 48(%rsi),%r10 + 
+ movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + andnq %r9,%rax,%r9 + andnq %r11,%rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp __inner_loop_31 + + .byte 0xf3,0xc3 + +.def __inner_loop_31; .scl 3; .type 32; .endef +.p2align 5 +__inner_loop_31: + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +.Loop_31: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edi + jnz .Loop_31 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 + + +.def __inner_loop_62; .scl 3; .type 32; .endef +.p2align 5 +__inner_loop_62: + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + +.Loop_62: + xorq %rax,%rax + testq $1,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq $1,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz .Loop_62 + + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_ctx_inverse_mod_383 +.rva .LSEH_body_ctx_inverse_mod_383 +.rva .LSEH_info_ctx_inverse_mod_383_prologue + +.rva .LSEH_body_ctx_inverse_mod_383 +.rva .LSEH_epilogue_ctx_inverse_mod_383 +.rva .LSEH_info_ctx_inverse_mod_383_body + +.rva .LSEH_epilogue_ctx_inverse_mod_383 +.rva .LSEH_end_ctx_inverse_mod_383 +.rva .LSEH_info_ctx_inverse_mod_383_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_ctx_inverse_mod_383_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_ctx_inverse_mod_383_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x8b,0x00 +.byte 0x00,0xe4,0x8c,0x00 +.byte 0x00,0xd4,0x8d,0x00 +.byte 0x00,0xc4,0x8e,0x00 +.byte 0x00,0x34,0x8f,0x00 +.byte 0x00,0x54,0x90,0x00 +.byte 0x00,0x74,0x92,0x00 +.byte 0x00,0x64,0x93,0x00 +.byte 0x00,0x01,0x91,0x00 +.LSEH_info_ctx_inverse_mod_383_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/div3w-armv8.S b/crypto/blst_src/build/coff/div3w-armv8.S new file mode 100644 index 00000000000..c17b9e38336 --- /dev/null +++ b/crypto/blst_src/build/coff/div3w-armv8.S @@ -0,0 +1,94 @@ +.text + +.globl div_3_limbs +.def div_3_limbs; +.type 32; +.endef +.p2align 5 +div_3_limbs: + ldp x4,x5,[x0] // load R + eor x0,x0,x0 // Q = 0 + mov x3,#64 // loop counter + nop + +.Loop: + subs x6,x4,x1 // R - D + add x0,x0,x0 // Q <<= 1 + sbcs x7,x5,x2 + add x0,x0,#1 // Q + speculative bit + csel x4,x4,x6,lo // select between R and R - D + extr x1,x2,x1,#1 // D >>= 1 + csel x5,x5,x7,lo + lsr x2,x2,#1 + sbc 
x0,x0,xzr // subtract speculative bit + sub x3,x3,#1 + cbnz x3,.Loop + + asr x3,x0,#63 // top bit -> mask + add x0,x0,x0 // Q <<= 1 + subs x6,x4,x1 // R - D + add x0,x0,#1 // Q + specilative bit + sbcs x7,x5,x2 + sbc x0,x0,xzr // subtract speculative bit + + orr x0,x0,x3 // all ones if overflow + + ret + +.globl quot_rem_128 +.def quot_rem_128; +.type 32; +.endef +.p2align 5 +quot_rem_128: + ldp x3,x4,[x1] + + mul x5,x3,x2 // divisor[0:1} * quotient + umulh x6,x3,x2 + mul x11, x4,x2 + umulh x7,x4,x2 + + ldp x8,x9,[x0] // load 3 limbs of the dividend + ldr x10,[x0,#16] + + adds x6,x6,x11 + adc x7,x7,xzr + + subs x8,x8,x5 // dividend - divisor * quotient + sbcs x9,x9,x6 + sbcs x10,x10,x7 + sbc x5,xzr,xzr // borrow -> mask + + add x2,x2,x5 // if borrowed, adjust the quotient ... + and x3,x3,x5 + and x4,x4,x5 + adds x8,x8,x3 // ... and add divisor + adc x9,x9,x4 + + stp x8,x9,[x0] // save 2 limbs of the remainder + str x2,[x0,#16] // and one limb of the quotient + + mov x0,x2 // return adjusted quotient + + ret + + +.globl quot_rem_64 +.def quot_rem_64; +.type 32; +.endef +.p2align 5 +quot_rem_64: + ldr x3,[x1] + ldr x8,[x0] // load 1 limb of the dividend + + mul x5,x3,x2 // divisor * quotient + + sub x8,x8,x5 // dividend - divisor * quotient + + stp x8,x2,[x0] // save remainder and quotient + + mov x0,x2 // return quotient + + ret + diff --git a/crypto/blst_src/build/coff/div3w-x86_64.s b/crypto/blst_src/build/coff/div3w-x86_64.s new file mode 100644 index 00000000000..fcfe54480be --- /dev/null +++ b/crypto/blst_src/build/coff/div3w-x86_64.s @@ -0,0 +1,140 @@ +.text + +.globl div_3_limbs + +.def div_3_limbs; .scl 2; .type 32; .endef +.p2align 5 +div_3_limbs: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_div_3_limbs: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + movq (%rdi),%r8 + movq 8(%rdi),%r9 + xorq %rax,%rax + movl $64,%ecx + +.Loop: + movq %r8,%r10 + subq %rsi,%r8 + movq %r9,%r11 + sbbq %rdx,%r9 + leaq 1(%rax,%rax,1),%rax + movq %rdx,%rdi + cmovcq %r10,%r8 + cmovcq %r11,%r9 + sbbq $0,%rax + shlq $63,%rdi + shrq $1,%rsi + shrq $1,%rdx + orq %rdi,%rsi + subl $1,%ecx + jnz .Loop + + leaq 1(%rax,%rax,1),%rcx + sarq $63,%rax + + subq %rsi,%r8 + sbbq %rdx,%r9 + sbbq $0,%rcx + + orq %rcx,%rax + + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_div_3_limbs: +.globl quot_rem_128 + +.def quot_rem_128; .scl 2; .type 32; .endef +.p2align 5 +quot_rem_128: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_quot_rem_128: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + movq %rdx,%rax + movq %rdx,%rcx + + mulq 0(%rsi) + movq %rax,%r8 + movq %rcx,%rax + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r9 + adcq $0,%rdx + + movq 0(%rdi),%r10 + movq 8(%rdi),%r11 + movq 16(%rdi),%rax + + subq %r8,%r10 + sbbq %r9,%r11 + sbbq %rdx,%rax + sbbq %r8,%r8 + + addq %r8,%rcx + movq %r8,%r9 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + addq %r8,%r10 + adcq %r9,%r11 + + movq %r10,0(%rdi) + movq %r11,8(%rdi) + movq %rcx,16(%rdi) + + movq %rcx,%rax + + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_quot_rem_128: + + + + + +.globl quot_rem_64 + +.def quot_rem_64; .scl 2; .type 32; .endef +.p2align 5 +quot_rem_64: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_quot_rem_64: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + movq %rdx,%rax + imulq 0(%rsi),%rdx + + movq 0(%rdi),%r10 + + subq %rdx,%r10 + + movq %r10,0(%rdi) + movq 
%rax,8(%rdi) + + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_quot_rem_64: +.section .pdata +.p2align 2 +.section .xdata +.p2align 3 diff --git a/crypto/blst_src/build/coff/mul_mont_256-armv8.S b/crypto/blst_src/build/coff/mul_mont_256-armv8.S new file mode 100644 index 00000000000..8cadbb89344 --- /dev/null +++ b/crypto/blst_src/build/coff/mul_mont_256-armv8.S @@ -0,0 +1,474 @@ +.text + +.globl mul_mont_sparse_256 + +.def mul_mont_sparse_256; +.type 32; +.endef +.p2align 5 +mul_mont_sparse_256: + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x10,x11,[x1] + ldr x9, [x2] + ldp x12,x13,[x1,#16] + + mul x19,x10,x9 + ldp x5,x6,[x3] + mul x20,x11,x9 + ldp x7,x8,[x3,#16] + mul x21,x12,x9 + mul x22,x13,x9 + + umulh x14,x10,x9 + umulh x15,x11,x9 + mul x3,x4,x19 + umulh x16,x12,x9 + umulh x17,x13,x9 + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,xzr, x17 + mul x17,x8,x3 + ldr x9,[x2,8*1] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*2] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*3] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + adcs x20,x21,x15 + adcs x21,x22,x16 + adcs x22,x23,x17 + adc x23,xzr,xzr + + subs x14,x19,x5 + sbcs x15,x20,x6 + sbcs x16,x21,x7 + sbcs x17,x22,x8 + sbcs xzr, x23,xzr + + csel x19,x19,x14,lo + csel x20,x20,x15,lo + csel 
x21,x21,x16,lo + csel x22,x22,x17,lo + + stp x19,x20,[x0] + stp x21,x22,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret + +.globl sqr_mont_sparse_256 + +.def sqr_mont_sparse_256; +.type 32; +.endef +.p2align 5 +sqr_mont_sparse_256: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + mov x4,x3 + + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x11,x6,x5 // a[1]*a[0] + umulh x15,x6,x5 + mul x12,x7,x5 // a[2]*a[0] + umulh x16,x7,x5 + mul x13,x8,x5 // a[3]*a[0] + umulh x19,x8,x5 + + adds x12,x12,x15 // accumulate high parts of multiplication + mul x14,x7,x6 // a[2]*a[1] + umulh x15,x7,x6 + adcs x13,x13,x16 + mul x16,x8,x6 // a[3]*a[1] + umulh x17,x8,x6 + adc x19,x19,xzr // can't overflow + + mul x20,x8,x7 // a[3]*a[2] + umulh x21,x8,x7 + + adds x15,x15,x16 // accumulate high parts of multiplication + mul x10,x5,x5 // a[0]*a[0] + adc x16,x17,xzr // can't overflow + + adds x13,x13,x14 // accumulate low parts of multiplication + umulh x5,x5,x5 + adcs x19,x19,x15 + mul x15,x6,x6 // a[1]*a[1] + adcs x20,x20,x16 + umulh x6,x6,x6 + adc x21,x21,xzr // can't overflow + + adds x11,x11,x11 // acc[1-6]*=2 + mul x16,x7,x7 // a[2]*a[2] + adcs x12,x12,x12 + umulh x7,x7,x7 + adcs x13,x13,x13 + mul x17,x8,x8 // a[3]*a[3] + adcs x19,x19,x19 + umulh x8,x8,x8 + adcs x20,x20,x20 + adcs x21,x21,x21 + adc x22,xzr,xzr + + adds x11,x11,x5 // +a[i]*a[i] + adcs x12,x12,x15 + adcs x13,x13,x6 + adcs x19,x19,x16 + adcs x20,x20,x7 + adcs x21,x21,x17 + adc x22,x22,x8 + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + adds x10,x10,x19 // accumulate upper half + adcs x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adc x19,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x19,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + +.globl from_mont_256 + +.def from_mont_256; +.type 32; +.endef +.p2align 5 +from_mont_256: +.long 3573752639 + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 +.long 3573752767 + ret + + +.globl redc_mont_256 + +.def redc_mont_256; +.type 32; +.endef +.p2align 5 +redc_mont_256: +.long 3573752639 + stp x29,x30,[sp,#-16]! 
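
The block-diagram comment in sqr_mont_sparse_256 above describes the standard squaring shortcut: accumulate each cross product a[i]*a[j] (i < j) once, double the whole accumulator, then add the squares a[i]*a[i] on the even columns. A small self-contained check of that identity, using 16-bit "limbs" so plain uint64/__int128 arithmetic can hold every column (values and layout are illustrative only):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* v = a0 + a1*B + a2*B^2 + a3*B^3 with B = 2^16 */
        const uint64_t B = 1u << 16;
        const uint16_t a[4] = { 0x1234, 0xfedc, 0x0042, 0x9abc };
        uint64_t v = a[0] + a[1]*B + a[2]*B*B + a[3]*B*B*B;

        uint64_t acc[7] = {0};

        for (int i = 0; i < 4; i++)         /* cross products, each taken once */
            for (int j = i + 1; j < 4; j++)
                acc[i + j] += (uint64_t)a[i] * a[j];

        for (int k = 0; k < 7; k++)         /* double the accumulator ...      */
            acc[k] *= 2;

        for (int i = 0; i < 4; i++)         /* ... and add the squares         */
            acc[2*i] += (uint64_t)a[i] * a[i];

        unsigned __int128 lhs = 0;          /* recombine columns at B = 2^16   */
        for (int k = 6; k >= 0; k--)
            lhs = (lhs << 16) + acc[k];

        assert(lhs == (unsigned __int128)v * v);
        return 0;
    }

In the assembly the doubling is the run of adds-with-carry marked "acc[1-6]*=2", and the "can't overflow" notes flag additions into the high halves of products, which the original comment explains can never be all ones.
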
+ add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + ldp x14,x15,[x1,#32] + ldp x16,x17,[x1,#48] + + adds x10,x10,x14 + adcs x11,x11,x15 + adcs x12,x12,x16 + adcs x13,x13,x17 + adc x9,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x9,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 +.long 3573752767 + ret + + +.def __mul_by_1_mont_256; +.type 32; +.endef +.p2align 5 +__mul_by_1_mont_256: + mul x3,x4,x10 + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + adc x13,x9,x17 + + ret + diff --git a/crypto/blst_src/build/coff/mul_mont_384-armv8.S b/crypto/blst_src/build/coff/mul_mont_384-armv8.S new file mode 100644 index 00000000000..074f38c495c --- /dev/null +++ b/crypto/blst_src/build/coff/mul_mont_384-armv8.S @@ -0,0 +1,2424 @@ +.text + +.globl add_mod_384x384 +.def add_mod_384x384; +.type 32; +.endef +.p2align 5 +add_mod_384x384: +.long 3573752639 + stp x29,x30,[sp,#-64]! 
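
__mul_by_1_mont_256 above (and the 384-bit variants later in this patch) removes one power of the word size per round of Montgomery reduction: each round computes m = n0*x[0] so that the low limb of x + m*p is exactly zero, which is why the commented-out low-limb addition can be replaced by the carry-only "subs xzr,x10,#1", and then shifts everything down one limb. A minimal single-word sketch of one such round, assuming an odd modulus and n0 = -p^-1 mod 2^64; the helper name and the modulus are illustrative, not the library's API:

    #include <assert.h>
    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* n0 = -p^{-1} mod 2^64 via Newton iteration (p must be odd) */
    static uint64_t neg_inv64(uint64_t p)
    {
        uint64_t x = p;                  /* correct to 3 bits: p*p == 1 (mod 8) */
        for (int i = 0; i < 5; i++)
            x *= 2 - p * x;              /* each pass doubles the correct bits  */
        return (uint64_t)0 - x;
    }

    int main(void)
    {
        uint64_t p  = 0xffffffff00000001ULL;       /* any odd modulus works    */
        uint64_t n0 = neg_inv64(p);
        uint64_t x  = 0x0123456789abcdefULL % p;

        /* one reduction round: choose m so the low word of x + m*p cancels,
         * then drop that word, i.e. divide exactly by 2^64                  */
        uint64_t m = x * n0;
        u128 t = (u128)m * p + x;
        assert((uint64_t)t == 0);        /* low word is zero by construction    */
        uint64_t r = (uint64_t)(t >> 64);

        /* r is x * 2^-64 mod p: multiplying back by 2^64 returns x (mod p)  */
        assert((((u128)r << 64) + p - x) % p == 0);
        return 0;
    }

The 256-bit routine runs four such rounds back to back across the limb array, so it returns x * 2^-256 mod p without ever performing a trial division.
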
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __add_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 +.long 3573752767 + ret + + +.def __add_mod_384x384; +.type 32; +.endef +.p2align 5 +__add_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + stp x11, x12, [x0] + adcs x15,x15,x23 + ldp x11, x12, [x1,#48] + adcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + adcs x11,x11,x19 + stp x15, x16, [x0,#32] + adcs x12,x12,x20 + ldp x15, x16, [x1,#80] + adcs x13,x13,x21 + ldp x23,x24,[x2,#80] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + stp x11,x12,[x0,#48] + csel x15,x15,x23,lo + stp x13,x14,[x0,#64] + csel x16,x16,x24,lo + stp x15,x16,[x0,#80] + + ret + + +.globl sub_mod_384x384 +.def sub_mod_384x384; +.type 32; +.endef +.p2align 5 +sub_mod_384x384: +.long 3573752639 + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 +.long 3573752767 + ret + + +.def __sub_mod_384x384; +.type 32; +.endef +.p2align 5 +__sub_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + stp x11, x12, [x0] + sbcs x15,x15,x23 + ldp x11, x12, [x1,#48] + sbcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + sbcs x11,x11,x19 + stp x15, x16, [x0,#32] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#80] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#80] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + ret + + +.def __add_mod_384; +.type 32; +.endef +.p2align 5 +__add_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + stp x11,x12,[x0] + csel x16,x16,x24,lo + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + + +.def __sub_mod_384; +.type 32; +.endef +.p2align 5 +__sub_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp 
x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0] + adc x16,x16,x24 + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + + +.globl mul_mont_384x + +.def mul_mont_384x; +.type 32; +.endef +.p2align 5 +mul_mont_384x: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#288 // space for 3 768-bit vectors + + mov x26,x0 // save r_ptr + mov x27,x1 // save b_ptr + mov x28,x2 // save b_ptr + + sub x0,sp,#0 // mul_384(t0, a->re, b->re) + bl __mul_384 + + add x1,x1,#48 // mul_384(t1, a->im, b->im) + add x2,x2,#48 + add x0,sp,#96 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + sub x2,x1,#48 + add x0,sp,#240 + bl __add_mod_384 + + add x1,x28,#0 + add x2,x28,#48 + add x0,sp,#192 // t2 + bl __add_mod_384 + + add x1,x0,#0 + add x2,x0,#48 + bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,x0 + add x2,sp,#0 + bl __sub_mod_384x384 + + add x2,sp,#96 + bl __sub_mod_384x384 // t2 = t2-t0-t1 + + add x1,sp,#0 + add x2,sp,#96 + add x0,sp,#0 + bl __sub_mod_384x384 // t0 = t0-t1 + + add x1,sp,#0 // ret->re = redc(t0) + add x0,x26,#0 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + + add x1,sp,#192 // ret->im = redc(t2) + add x0,x0,#48 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#288 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl sqr_mont_384x + +.def sqr_mont_384x; +.type 32; +.endef +.p2align 5 +sqr_mont_384x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
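
mul_mont_384x above multiplies two Fp2 elements (re + im*u with u^2 = -1) using three 384-bit multiplications instead of four, exactly as its inline comments spell out: t0 = a->re*b->re, t1 = a->im*b->im, t2 = (a->re + a->im)*(b->re + b->im), then ret->re = redc(t0 - t1) and ret->im = redc(t2 - t0 - t1). A toy sketch of the same Karatsuba-style shape, with a small prime standing in for the 384-bit Montgomery field (the prime, names and values are illustrative only):

    #include <assert.h>
    #include <stdint.h>

    #define P 2147483647u   /* 2^31 - 1, a prime; stand-in for the 384-bit field */

    static uint32_t addp(uint32_t a, uint32_t b) { return (uint32_t)(((uint64_t)a + b) % P); }
    static uint32_t subp(uint32_t a, uint32_t b) { return (uint32_t)(((uint64_t)a + P - b) % P); }
    static uint32_t mulp(uint32_t a, uint32_t b) { return (uint32_t)((uint64_t)a * b % P); }

    int main(void)
    {
        uint32_t a0 = 123456789, a1 = 987654321, b0 = 55555, b1 = 777777777;

        uint32_t t0 = mulp(a0, b0);                      /* a->re * b->re        */
        uint32_t t1 = mulp(a1, b1);                      /* a->im * b->im        */
        uint32_t t2 = mulp(addp(a0, a1), addp(b0, b1));  /* (re+im) * (re+im)    */

        uint32_t re = subp(t0, t1);                      /* t0 - t1              */
        uint32_t im = subp(subp(t2, t0), t1);            /* t2 - t0 - t1         */

        /* must agree with the four-multiplication schoolbook formula */
        assert(re == subp(mulp(a0, b0), mulp(a1, b1)));
        assert(im == addp(mulp(a0, b1), mulp(a1, b0)));
        return 0;
    }

One full-width multiplication is traded for a few extra 384-bit additions and subtractions, a clear win at this operand size.
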
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 2 384-bit vectors + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + add x0,sp,#0 + bl __add_mod_384 // t0 = a->re + a->im + + add x0,sp,#48 + bl __sub_mod_384 // t1 = a->re - a->im + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) + + adds x11,x11,x11 // add with itself + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x19,x11,x19,lo + csel x20,x12,x20,lo + csel x21,x13,x21,lo + ldp x11,x12,[sp] + csel x22,x14,x22,lo + ldr x17, [sp,#48] + csel x23,x15,x23,lo + ldp x13,x14,[sp,#16] + csel x24,x16,x24,lo + ldp x15,x16,[sp,#32] + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + add x2,sp,#48 + bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl mul_mont_384 + +.def mul_mont_384; +.type 32; +.endef +.p2align 5 +mul_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.def __mul_mont_384; +.type 32; +.endef +.p2align 5 +__mul_mont_384: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + mov x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*1] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh 
x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*2] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*3] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*4] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs 
x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*5] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + adc x17,x17,xzr + + adds x19,x20,x26 + adcs x20,x21,x27 + adcs x21,x22,x28 + adcs x22,x23,x0 + adcs x23,x24,x1 + adcs x24,x25,x3 + adc x25,x17,xzr + + subs x26,x19,x5 + sbcs x27,x20,x6 + sbcs x28,x21,x7 + sbcs x0,x22,x8 + sbcs x1,x23,x9 + sbcs x3,x24,x10 + sbcs xzr, x25,xzr + + csel x11,x19,x26,lo + csel x12,x20,x27,lo + csel x13,x21,x28,lo + csel x14,x22,x0,lo + csel x15,x23,x1,lo + csel x16,x24,x3,lo + ret + + +.globl sqr_mont_384 + +.def sqr_mont_384; +.type 32; +.endef +.p2align 5 +sqr_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for 768-bit vector + mov x4,x3 // adjust for missing b_ptr + + mov x3,x0 // save r_ptr + mov x0,sp + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + mov x1,sp + mov x0,x3 // restore r_ptr + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl sqr_n_mul_mont_383 + +.def sqr_n_mul_mont_383; +.type 32; +.endef +.p2align 5 +sqr_n_mul_mont_383: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 768-bit vector + mov x17,x5 // save b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + mov x0,sp +.Loop_sqr_383: + bl __sqr_384 + sub x2,x2,#1 // counter + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,sp + bl __mul_by_1_mont_384 + + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // just accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + cbnz x2,.Loop_sqr_383 + + mov x2,x17 + ldr x17,[x17] + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + +.def __sqr_384; +.type 32; +.endef +.p2align 5 +__sqr_384: + mul x19,x12,x11 + mul x20,x13,x11 + mul x21,x14,x11 + mul x22,x15,x11 + mul x23,x16,x11 + + umulh x6,x12,x11 + umulh x7,x13,x11 + umulh x8,x14,x11 + umulh x9,x15,x11 + adds x20,x20,x6 + umulh x10,x16,x11 + adcs x21,x21,x7 + mul x7,x13,x12 + adcs x22,x22,x8 + mul x8,x14,x12 + adcs x23,x23,x9 + mul x9,x15,x12 + adc x24,xzr, x10 + mul x10,x16,x12 + + adds x21,x21,x7 + umulh x7,x13,x12 + adcs x22,x22,x8 + umulh x8,x14,x12 + adcs x23,x23,x9 + umulh x9,x15,x12 + adcs x24,x24,x10 + umulh x10,x16,x12 + adc x25,xzr,xzr + + mul x5,x11,x11 + adds x22,x22,x7 + umulh x11, x11,x11 + adcs x23,x23,x8 + mul x8,x14,x13 + adcs x24,x24,x9 + mul x9,x15,x13 + adc x25,x25,x10 + mul x10,x16,x13 + + adds x23,x23,x8 + umulh x8,x14,x13 + adcs x24,x24,x9 + umulh x9,x15,x13 + adcs x25,x25,x10 + umulh x10,x16,x13 + adc x26,xzr,xzr + + mul x6,x12,x12 + adds x24,x24,x8 + umulh x12, x12,x12 + adcs x25,x25,x9 + mul x9,x15,x14 + adc x26,x26,x10 + mul x10,x16,x14 + + adds x25,x25,x9 + umulh x9,x15,x14 + adcs x26,x26,x10 + umulh x10,x16,x14 + adc x27,xzr,xzr + mul x7,x13,x13 + adds x26,x26,x9 + umulh x13, x13,x13 + adc x27,x27,x10 + mul x8,x14,x14 + + mul x10,x16,x15 + umulh x14, x14,x14 + adds x27,x27,x10 + umulh x10,x16,x15 + mul x9,x15,x15 + adc x28,x10,xzr + + adds x19,x19,x19 + adcs x20,x20,x20 + adcs x21,x21,x21 + adcs x22,x22,x22 + adcs x23,x23,x23 + adcs x24,x24,x24 + adcs x25,x25,x25 + adcs x26,x26,x26 + umulh x15, x15,x15 + adcs x27,x27,x27 + mul x10,x16,x16 + adcs x28,x28,x28 + umulh x16, x16,x16 + adc x1,xzr,xzr + + adds x19,x19,x11 + adcs x20,x20,x6 + adcs x21,x21,x12 + adcs x22,x22,x7 + adcs x23,x23,x13 + adcs x24,x24,x8 + adcs x25,x25,x14 + stp x5,x19,[x0] + adcs x26,x26,x9 + stp x20,x21,[x0,#16] + adcs x27,x27,x15 + stp x22,x23,[x0,#32] + adcs x28,x28,x10 + stp x24,x25,[x0,#48] + adc x16,x16,x1 + stp x26,x27,[x0,#64] + stp x28,x16,[x0,#80] + + ret + +.globl sqr_384 + +.def sqr_384; +.type 32; +.endef +.p2align 5 +sqr_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl redc_mont_384 + +.def redc_mont_384; +.type 32; +.endef +.p2align 5 +redc_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl from_mont_384 + +.def from_mont_384; +.type 32; +.endef +.p2align 5 +from_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.def __mul_by_1_mont_384; +.type 32; +.endef +.p2align 5 +__mul_by_1_mont_384: + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + mul x26,x4,x11 + ldp x15,x16,[x1,#32] + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + 
adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + ret + + +.def __redc_tail_mont_384; +.type 32; +.endef +.p2align 5 +__redc_tail_mont_384: + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + + +.globl mul_384 + +.def mul_384; +.type 32; +.endef +.p2align 5 +mul_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + bl __mul_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.def __mul_384; +.type 32; +.endef +.p2align 5 +__mul_384: + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + + umulh x5,x11,x17 + umulh x6,x12,x17 + umulh x7,x13,x17 + umulh x8,x14,x17 + umulh x9,x15,x17 + umulh x10,x16,x17 + ldr x17,[x2,8*1] + + str x19,[x0] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,xzr, x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(1+1)] + adc x25,xzr,xzr + + str x19,[x0,8*1] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(2+1)] + adc x25,xzr,xzr + + str x19,[x0,8*2] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(3+1)] + adc x25,xzr,xzr + + str x19,[x0,8*3] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(4+1)] + adc x25,xzr,xzr + + str x19,[x0,8*4] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + adc x25,xzr,xzr + + str x19,[x0,8*5] + adds x19,x20,x5 + adcs x20,x21,x6 + adcs x21,x22,x7 + adcs x22,x23,x8 + adcs x23,x24,x9 + adc x24,x25,x10 + + stp x19,x20,[x0,#48] + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ret + + +.globl mul_382x + +.def mul_382x; +.type 32; +.endef +.p2align 5 +mul_382x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
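
__mul_384 above is a plain, non-reducing schoolbook multiplication: it walks the words of b, multiplies the whole of a by each word, and accumulates the rows into a 768-bit result. A hedged C sketch of that operand-scanning pattern (the function name and test values are illustrative):

    #include <assert.h>
    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* 6x6-limb schoolbook multiply, one word of b per outer pass, matching the
     * unrolled assembly; r receives the full 12-limb product                  */
    static void mul_384_sketch(uint64_t r[12],
                               const uint64_t a[6], const uint64_t b[6])
    {
        for (int i = 0; i < 12; i++)
            r[i] = 0;
        for (int j = 0; j < 6; j++) {
            u128 carry = 0;
            for (int i = 0; i < 6; i++) {
                carry += (u128)a[i] * b[j] + r[i + j];
                r[i + j] = (uint64_t)carry;
                carry >>= 64;
            }
            r[j + 6] = (uint64_t)carry;
        }
    }

    int main(void)
    {
        uint64_t a[6] = { 0xffffffffffffffffULL, 0, 0, 0, 0, 0 };
        uint64_t b[6] = { 0xfedcba9876543210ULL, 0, 0, 0, 0, 0 };
        uint64_t r[12];

        mul_384_sketch(r, a, b);

        /* spot-check the low limbs against native 128-bit arithmetic */
        u128 p = (u128)a[0] * b[0];
        assert(r[0] == (uint64_t)p && r[1] == (uint64_t)(p >> 64));
        return 0;
    }

No modular reduction happens at this stage; callers reduce the 768-bit product separately.
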
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for two 384-bit vectors + + ldp x11,x12,[x1] + mov x26,x0 // save r_ptr + ldp x19,x20,[x1,#48] + mov x27,x1 // save a_ptr + ldp x13,x14,[x1,#16] + mov x28,x2 // save b_ptr + ldp x21,x22,[x1,#64] + ldp x15,x16,[x1,#32] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x23,x24,[x1,#80] + adcs x6,x12,x20 + ldp x11,x12,[x2] + adcs x7,x13,x21 + ldp x19,x20,[x2,#48] + adcs x8,x14,x22 + ldp x13,x14,[x2,#16] + adcs x9,x15,x23 + ldp x21,x22,[x2,#64] + adc x10,x16,x24 + ldp x15,x16,[x2,#32] + + stp x5,x6,[sp] + adds x5,x11,x19 // t1 = b->re + b->im + ldp x23,x24,[x2,#80] + adcs x6,x12,x20 + stp x7,x8,[sp,#16] + adcs x7,x13,x21 + adcs x8,x14,x22 + stp x9,x10,[sp,#32] + adcs x9,x15,x23 + stp x5,x6,[sp,#48] + adc x10,x16,x24 + stp x7,x8,[sp,#64] + stp x9,x10,[sp,#80] + + bl __mul_384 // mul_384(ret->re, a->re, b->re) + + add x1,sp,#0 // mul_384(ret->im, t0, t1) + add x2,sp,#48 + add x0,x26,#96 + bl __mul_384 + + add x1,x27,#48 // mul_384(tx, a->im, b->im) + add x2,x28,#48 + add x0,sp,#0 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + add x1,x26,#96 // ret->im -= tx + add x2,sp,#0 + add x0,x26,#96 + bl __sub_mod_384x384 + + add x2,x26,#0 // ret->im -= ret->re + bl __sub_mod_384x384 + + add x1,x26,#0 // ret->re -= tx + add x2,sp,#0 + add x0,x26,#0 + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl sqr_382x + +.def sqr_382x; +.type 32; +.endef +.p2align 5 +sqr_382x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x19,x20,[x1,#48] + ldp x13,x14,[x1,#16] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x21,x22,[x1,#64] + adcs x6,x12,x20 + ldp x15,x16,[x1,#32] + adcs x7,x13,x21 + ldp x23,x24,[x1,#80] + adcs x8,x14,x22 + stp x5,x6,[x0] + adcs x9,x15,x23 + ldp x5,x6,[x2] + adc x10,x16,x24 + stp x7,x8,[x0,#16] + + subs x11,x11,x19 // t1 = a->re - a->im + ldp x7,x8,[x2,#16] + sbcs x12,x12,x20 + stp x9,x10,[x0,#32] + sbcs x13,x13,x21 + ldp x9,x10,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + adds x11,x11,x19 + and x21,x7,x25 + adcs x12,x12,x20 + and x22,x8,x25 + adcs x13,x13,x21 + and x23,x9,x25 + adcs x14,x14,x22 + and x24,x10,x25 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + mov x4,x1 // save a_ptr + add x1,x0,#0 // mul_384(ret->re, t0, t1) + add x2,x0,#48 + bl __mul_384 + + add x1,x4,#0 // mul_384(ret->im, a->re, a->im) + add x2,x4,#48 + add x0,x0,#96 + bl __mul_384 + ldr x30,[x29,#8] + + ldp x11,x12,[x0] + ldp x13,x14,[x0,#16] + adds x11,x11,x11 // add with itself + ldp x15,x16,[x0,#32] + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adcs x19,x19,x19 + adcs x20,x20,x20 + stp x11,x12,[x0] + adcs x21,x21,x21 + stp x13,x14,[x0,#16] + adcs x22,x22,x22 + stp x15,x16,[x0,#32] + adcs x23,x23,x23 + stp x19,x20,[x0,#48] + adc x24,x24,x24 + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl sqr_mont_382x + +.def sqr_mont_382x; +.type 32; +.endef +.p2align 5 +sqr_mont_382x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
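
sqr_382x above uses the complementary identity for squaring: with t0 = a->re + a->im and t1 = a->re - a->im (plus the conditional modulus add that keeps t1 non-negative), the real part is t0*t1 and the imaginary part is 2*(a->re*a->im), the doubling being the final "add with itself" pass. A tiny integer check of the identity (values illustrative only):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* a = a0 + a1*u with u^2 = -1: squaring needs only two multiplications */
        int64_t a0 = 123456789, a1 = -987654;

        int64_t re = (a0 + a1) * (a0 - a1);   /* t0 * t1                        */
        int64_t im = 2 * a0 * a1;             /* doubled product                */

        assert(re == a0*a0 - a1*a1);          /* real part of (a0 + a1*u)^2     */
        assert(im == a0*a1 + a1*a0);          /* imaginary part                 */
        return 0;
    }
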
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#112 // space for two 384-bit vectors + word + mov x4,x3 // adjust for missing b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x17,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x5,x11,x17 // t0 = a->re + a->im + adcs x6,x12,x20 + adcs x7,x13,x21 + adcs x8,x14,x22 + adcs x9,x15,x23 + adc x10,x16,x24 + + subs x19,x11,x17 // t1 = a->re - a->im + sbcs x20,x12,x20 + sbcs x21,x13,x21 + sbcs x22,x14,x22 + sbcs x23,x15,x23 + sbcs x24,x16,x24 + sbc x25,xzr,xzr // borrow flag as mask + + stp x5,x6,[sp] + stp x7,x8,[sp,#16] + stp x9,x10,[sp,#32] + stp x19,x20,[sp,#48] + stp x21,x22,[sp,#64] + stp x23,x24,[sp,#80] + str x25,[sp,#96] + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) + + adds x19,x11,x11 // add with itself + adcs x20,x12,x12 + adcs x21,x13,x13 + adcs x22,x14,x14 + adcs x23,x15,x15 + adc x24,x16,x16 + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + ldp x11,x12,[sp] + ldr x17,[sp,#48] + ldp x13,x14,[sp,#16] + ldp x15,x16,[sp,#32] + + add x2,sp,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) + ldr x30,[x29,#8] + + ldr x25,[sp,#96] // account for sign from a->re - a->im + ldp x19,x20,[sp] + ldp x21,x22,[sp,#16] + ldp x23,x24,[sp,#32] + + and x19,x19,x25 + and x20,x20,x25 + and x21,x21,x25 + and x22,x22,x25 + and x23,x23,x25 + and x24,x24,x25 + + subs x11,x11,x19 + sbcs x12,x12,x20 + sbcs x13,x13,x21 + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + and x21,x7,x25 + and x22,x8,x25 + and x23,x9,x25 + and x24,x10,x25 + + adds x11,x11,x19 + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#112 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.def __mul_mont_383_nonred; +.type 32; +.endef +.p2align 5 +__mul_mont_383_nonred: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + ldr x17,[x2,8*1] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc 
x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*2] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*3] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*4] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*5] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs 
x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + + adds x11,x20,x26 + adcs x12,x21,x27 + adcs x13,x22,x28 + adcs x14,x23,x0 + adcs x15,x24,x1 + adcs x16,x25,x3 + + ret + + +.globl sgn0_pty_mont_384 + +.def sgn0_pty_mont_384; +.type 32; +.endef +.p2align 5 +sgn0_pty_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + adds x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl sgn0_pty_mont_384x + +.def sgn0_pty_mont_384x; +.type 32; +.endef +.p2align 5 +sgn0_pty_mont_384x: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + add x1,x1,#48 + + and x2,x11,#1 + orr x3,x11,x12 + adds x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + orr x3,x3,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x2,x2,x17 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + orr x1,x11,x12 + adds x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + orr x1,x1,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + diff --git a/crypto/blst_src/build/coff/mulq_mont_256-x86_64.s b/crypto/blst_src/build/coff/mulq_mont_256-x86_64.s new file mode 100644 index 00000000000..dd1e00fa301 --- /dev/null +++ b/crypto/blst_src/build/coff/mulq_mont_256-x86_64.s @@ -0,0 +1,872 @@ +.text + +.globl mul_mont_sparse_256 + +.def mul_mont_sparse_256; .scl 2; .type 32; .endef +.p2align 5 +mul_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_mont_sparse_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_mul_mont_sparse_256: + + + movq 0(%rdx),%rax + movq 0(%rsi),%r13 + movq 8(%rsi),%r14 + movq 16(%rsi),%r12 + movq 24(%rsi),%rbp + movq %rdx,%rbx + + movq %rax,%r15 + mulq %r13 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_mont_sparse_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_mont_sparse_256: + +.globl sqr_mont_sparse_256 + +.def sqr_mont_sparse_256; .scl 2; .type 32; .endef +.p2align 5 +sqr_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_mont_sparse_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_sqr_mont_sparse_256: + + + movq 0(%rsi),%rax + movq %rcx,%r8 + movq 8(%rsi),%r14 + movq %rdx,%rcx + movq 16(%rsi),%r12 + leaq (%rsi),%rbx + movq 24(%rsi),%rbp + + movq %rax,%r15 + mulq %rax + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqr_mont_sparse_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_mont_sparse_256: +.def __mulq_mont_sparse_256; .scl 3; .type 32; .endef +.p2align 5 +__mulq_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + + mulq %r14 + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq 8(%rbx),%rax + adcq $0,%rdx + xorq %r14,%r14 + movq %rdx,%r13 + + movq %r9,%rdi + imulq %r8,%r9 + + + movq %rax,%r15 + mulq 0(%rsi) + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + xorq %r15,%r15 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r9,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rdi,%r10 + adcq $0,%rdx + movq %rdx,%rbp + 
+ mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + addq %rdx,%r13 + adcq $0,%r14 + adcq $0,%r15 + movq %r10,%rdi + imulq %r8,%r10 + + + movq %rax,%r9 + mulq 0(%rsi) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + xorq %r9,%r9 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r10,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rdi,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r13 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + addq %rdx,%r14 + adcq $0,%r15 + adcq $0,%r9 + movq %r11,%rdi + imulq %r8,%r11 + + + movq %rax,%r10 + mulq 0(%rsi) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r9 + xorq %r10,%r10 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r11,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rdi,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + addq %rdx,%r15 + adcq $0,%r9 + adcq $0,%r10 + imulq %r8,%rax + movq 8(%rsp),%rsi + + + movq %rax,%r11 + mulq 0(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r12,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + movq %r14,%rbx + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rdx,%r9 + adcq $0,%r10 + + + + + movq %r15,%r12 + subq 0(%rcx),%r13 + sbbq 8(%rcx),%r14 + sbbq 16(%rcx),%r15 + movq %r9,%rbp + sbbq 24(%rcx),%r9 + sbbq $0,%r10 + + cmovcq %rax,%r13 + cmovcq %rbx,%r14 + cmovcq %r12,%r15 + movq %r13,0(%rsi) + cmovcq %rbp,%r9 + movq %r14,8(%rsi) + movq %r15,16(%rsi) + movq %r9,24(%rsi) + + .byte 0xf3,0xc3 + + +.globl from_mont_256 + +.def from_mont_256; .scl 2; .type 32; .endef +.p2align 5 +from_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_from_mont_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_from_mont_256: + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + + + + + movq %r14,%r10 + movq %r15,%r11 + movq %r9,%r12 + + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + sbbq 24(%rbx),%r9 + + 
cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_from_mont_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_from_mont_256: + +.globl redc_mont_256 + +.def redc_mont_256; .scl 2; .type 32; .endef +.p2align 5 +redc_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_redc_mont_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_redc_mont_256: + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + addq 32(%rsi),%r13 + adcq 40(%rsi),%r14 + movq %r13,%rax + adcq 48(%rsi),%r15 + movq %r14,%r10 + adcq 56(%rsi),%r9 + sbbq %rsi,%rsi + + + + + movq %r15,%r11 + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + movq %r9,%r12 + sbbq 24(%rbx),%r9 + sbbq $0,%rsi + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_redc_mont_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_redc_mont_256: +.def __mulq_by_1_mont_256; .scl 3; .type 32; .endef +.p2align 5 +__mulq_by_1_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + movq %rax,%r13 + imulq %rcx,%rax + movq %rax,%r9 + + mulq 0(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq %rdx,%r13 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r10 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 16(%rbx) + movq %r10,%r14 + imulq %rcx,%r10 + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r11 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r13,%r12 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r9 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r15 + 
movq %r13,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_mul_mont_sparse_256 +.rva .LSEH_body_mul_mont_sparse_256 +.rva .LSEH_info_mul_mont_sparse_256_prologue + +.rva .LSEH_body_mul_mont_sparse_256 +.rva .LSEH_epilogue_mul_mont_sparse_256 +.rva .LSEH_info_mul_mont_sparse_256_body + +.rva .LSEH_epilogue_mul_mont_sparse_256 +.rva .LSEH_end_mul_mont_sparse_256 +.rva .LSEH_info_mul_mont_sparse_256_epilogue + +.rva .LSEH_begin_sqr_mont_sparse_256 +.rva .LSEH_body_sqr_mont_sparse_256 +.rva .LSEH_info_sqr_mont_sparse_256_prologue + +.rva .LSEH_body_sqr_mont_sparse_256 +.rva .LSEH_epilogue_sqr_mont_sparse_256 +.rva .LSEH_info_sqr_mont_sparse_256_body + +.rva .LSEH_epilogue_sqr_mont_sparse_256 +.rva .LSEH_end_sqr_mont_sparse_256 +.rva .LSEH_info_sqr_mont_sparse_256_epilogue + +.rva .LSEH_begin_from_mont_256 +.rva .LSEH_body_from_mont_256 +.rva .LSEH_info_from_mont_256_prologue + +.rva .LSEH_body_from_mont_256 +.rva .LSEH_epilogue_from_mont_256 +.rva .LSEH_info_from_mont_256_body + +.rva .LSEH_epilogue_from_mont_256 +.rva .LSEH_end_from_mont_256 +.rva .LSEH_info_from_mont_256_epilogue + +.rva .LSEH_begin_redc_mont_256 +.rva .LSEH_body_redc_mont_256 +.rva .LSEH_info_redc_mont_256_prologue + +.rva .LSEH_body_redc_mont_256 +.rva .LSEH_epilogue_redc_mont_256 +.rva .LSEH_info_redc_mont_256_body + +.rva .LSEH_epilogue_redc_mont_256 +.rva .LSEH_end_redc_mont_256 +.rva .LSEH_info_redc_mont_256_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_mul_mont_sparse_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_mont_sparse_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mul_mont_sparse_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_mont_sparse_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_mont_sparse_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sqr_mont_sparse_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_from_mont_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_from_mont_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_from_mont_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_redc_mont_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_redc_mont_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 
0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_redc_mont_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/mulq_mont_384-x86_64.s b/crypto/blst_src/build/coff/mulq_mont_384-x86_64.s new file mode 100644 index 00000000000..5663463524a --- /dev/null +++ b/crypto/blst_src/build/coff/mulq_mont_384-x86_64.s @@ -0,0 +1,4206 @@ +.text + + + + + + + +.def __sub_mod_384x384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 + + +.def __add_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__add_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + + +.def __sub_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + +.globl mul_mont_384x + +.def mul_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +mul_mont_384x: + 
.byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_mont_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $328,%rsp + +.LSEH_body_mul_mont_384x: + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulq_384 + + + leaq 48(%rbx),%rbx + leaq 48(%rsi),%rsi + leaq 40+96(%rsp),%rdi + call __mulq_384 + + + movq 8(%rsp),%rcx + leaq -48(%rsi),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulq_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + movq %rcx,%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_mul_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_mont_384x: +.globl sqr_mont_384x + +.def sqr_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +sqr_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_mont_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqr_mont_384x: + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + call __mulq_mont_384 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + movq %r14,%r12 + adcq %r9,%r9 + movq %r15,%r13 + adcq %r10,%r10 + movq %r8,%rax + adcq %r11,%r11 + movq %r9,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r10,%rbp + sbbq 16(%rcx),%r8 + sbbq 24(%rcx),%r9 + sbbq 32(%rcx),%r10 + movq %r11,%rsi + sbbq 40(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r14 + cmovcq %r13,%r15 + cmovcq %rax,%r8 + movq %r14,48(%rdi) + cmovcq %rbx,%r9 + movq %r15,56(%rdi) + cmovcq %rbp,%r10 + movq %r8,64(%rdi) + cmovcq %rsi,%r11 + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + 
+.LSEH_end_sqr_mont_384x: + +.globl mul_382x + +.def mul_382x; .scl 2; .type 32; .endef +.p2align 5 +mul_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_382x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_mul_382x: + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulq_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulq_384 + + + leaq 48(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulq_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_mul_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_382x: +.globl sqr_382x + +.def sqr_382x; .scl 2; .type 32; .endef +.p2align 5 +sqr_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_382x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_sqr_382x: + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulq_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulq_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + 
movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqr_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_382x: +.globl mul_384 + +.def mul_384; .scl 2; .type 32; .endef +.p2align 5 +mul_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + +.LSEH_body_mul_384: + + + movq %rdx,%rbx + call __mulq_384 + + movq 0(%rsp),%r12 + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_mul_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_384: + +.def __mulq_384; .scl 3; .type 32; .endef +.p2align 5 +__mulq_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rax + + movq %rax,%rbp + mulq 0(%rsi) + movq %rax,0(%rdi) + movq %rbp,%rax + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r11 + movq 8(%rbx),%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,8(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,16(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,24(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + 
mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,32(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,40(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq %rax,%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rcx,48(%rdi) + movq %r8,56(%rdi) + movq %r9,64(%rdi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + + .byte 0xf3,0xc3 + +.globl sqr_384 + +.def sqr_384; .scl 2; .type 32; .endef +.p2align 5 +sqr_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_384: + movq %rcx,%rdi + movq %rdx,%rsi + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sqr_384: + + + call __sqrq_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqr_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_384: + +.def __sqrq_384; .scl 3; .type 32; .endef +.p2align 5 +__sqrq_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r15 + movq 16(%rsi),%rcx + movq 24(%rsi),%rbx + + + movq %rax,%r14 + mulq %r15 + movq %rax,%r9 + movq %r14,%rax + movq 32(%rsi),%rbp + movq %rdx,%r10 + + mulq %rcx + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + movq 40(%rsi),%rsi + movq %rdx,%r11 + + mulq %rbx + addq %rax,%r11 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq %rax + xorq %r8,%r8 + movq %rax,0(%rdi) + movq %r15,%rax + addq %r9,%r9 + adcq $0,%r8 + addq %rdx,%r9 + adcq $0,%r8 + movq %r9,8(%rdi) + + mulq %rcx + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbx + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbp + addq %rax,%r13 + movq %r15,%rax + adcq $0,%rdx + addq 
%r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rax + xorq %r9,%r9 + addq %rax,%r8 + movq %rcx,%rax + addq %r10,%r10 + adcq %r11,%r11 + adcq $0,%r9 + addq %r8,%r10 + adcq %rdx,%r11 + adcq $0,%r9 + movq %r10,16(%rdi) + + mulq %rbx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + movq %r11,24(%rdi) + movq %rdx,%r8 + + mulq %rbp + addq %rax,%r14 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq %rsi + addq %rax,%r15 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + xorq %r11,%r11 + addq %rax,%r9 + movq %rbx,%rax + addq %r12,%r12 + adcq %r13,%r13 + adcq $0,%r11 + addq %r9,%r12 + adcq %rdx,%r13 + adcq $0,%r11 + movq %r12,32(%rdi) + + + mulq %rbp + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %r13,40(%rdi) + movq %rdx,%r8 + + mulq %rsi + addq %rax,%rcx + movq %rbx,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%rbx + + mulq %rax + xorq %r12,%r12 + addq %rax,%r11 + movq %rbp,%rax + addq %r14,%r14 + adcq %r15,%r15 + adcq $0,%r12 + addq %r11,%r14 + adcq %rdx,%r15 + movq %r14,48(%rdi) + adcq $0,%r12 + movq %r15,56(%rdi) + + + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + xorq %r13,%r13 + addq %rax,%r12 + movq %rsi,%rax + addq %rcx,%rcx + adcq %rbx,%rbx + adcq $0,%r13 + addq %r12,%rcx + adcq %rdx,%rbx + movq %rcx,64(%rdi) + adcq $0,%r13 + movq %rbx,72(%rdi) + + + mulq %rax + addq %r13,%rax + addq %rbp,%rbp + adcq $0,%rdx + addq %rbp,%rax + adcq $0,%rdx + movq %rax,80(%rdi) + movq %rdx,88(%rdi) + + .byte 0xf3,0xc3 + + +.globl sqr_mont_384 + +.def sqr_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sqr_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $120,%rsp + +.LSEH_body_sqr_mont_384: + + + movq %rcx,96(%rsp) + movq %rdx,104(%rsp) + movq %rdi,112(%rsp) + + movq %rsp,%rdi + call __sqrq_384 + + leaq 0(%rsp),%rsi + movq 96(%rsp),%rcx + movq 104(%rsp),%rbx + movq 112(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 120(%rsp),%r8 + movq 120(%rsp),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_mont_384: + + + +.globl redc_mont_384 + +.def redc_mont_384; .scl 2; .type 32; .endef +.p2align 5 +redc_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_redc_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_redc_mont_384: + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_redc_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_redc_mont_384: + + + + +.globl from_mont_384 + +.def from_mont_384; .scl 2; .type 32; .endef +.p2align 5 +from_mont_384: + .byte 
0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_from_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_from_mont_384: + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + + + + + + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_from_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_from_mont_384: +.def __mulq_by_1_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__mulq_by_1_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r8 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r9 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r10 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %r9,%r15 + imulq %rcx,%r9 + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 32(%rbx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 40(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r9,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %r10,%r8 + imulq %rcx,%r10 + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r8 + movq %r10,%rax + adcq %rdx,%r8 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %r11,%r9 + imulq %rcx,%r11 + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r11,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r12 + 
adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %r12,%r10 + imulq %rcx,%r12 + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %r13,%r11 + imulq %rcx,%r13 + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 + + +.def __redc_tail_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__redc_tail_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 + + +.globl sgn0_pty_mont_384 + +.def sgn0_pty_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sgn0_pty_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0_pty_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sgn0_pty_mont_384: + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + 
movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sgn0_pty_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0_pty_mont_384: + +.globl sgn0_pty_mont_384x + +.def sgn0_pty_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +sgn0_pty_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0_pty_mont_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sgn0_pty_mont_384x: + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sgn0_pty_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0_pty_mont_384x: +.globl mul_mont_384 + +.def mul_mont_384; .scl 2; .type 32; .endef +.p2align 5 +mul_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $24,%rsp + +.LSEH_body_mul_mont_384: + + + movq 0(%rdx),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + movq %rdx,%rbx + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + + call __mulq_mont_384 + + movq 24(%rsp),%r15 + + movq 32(%rsp),%r14 + + movq 40(%rsp),%r13 + + movq 48(%rsp),%r12 + + movq 56(%rsp),%rbx + + movq 64(%rsp),%rbp + + leaq 72(%rsp),%rsp + +.LSEH_epilogue_mul_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_mont_384: +.def __mulq_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__mulq_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rdi + mulq %r14 + movq %rax,%r8 + movq %rdi,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%rbp + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq 
%rax,%r13 + movq %r8,%rax + adcq $0,%rdx + xorq %r15,%r15 + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r8,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + adcq $0,%r15 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r9,%rbp + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r14 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r14 + movq %r9,%rax + adcq %rdx,%r15 + adcq $0,%r8 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r9,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + adcq $0,%r8 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r10,%rbp + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r15 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r15 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r10,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r8 + adcq $0,%r9 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + 
+ mulq 8(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r11,%rbp + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r11,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq %rdx,%r9 + adcq $0,%r10 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r12,%rbp + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r9 + adcq $0,%rdx + xorq %r11,%r11 + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r10 + adcq $0,%r11 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r12,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq %rdx,%r10 + adcq $0,%r11 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + movq %r13,%rbp + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r8 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rsi) + addq %r12,%r10 + adcq $0,%rdx + xorq %r12,%r12 + addq %rax,%r10 + movq %r13,%rax + adcq %rdx,%r11 + adcq $0,%r12 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r13,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq 
$0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq %rdx,%r11 + adcq $0,%r12 + + + + + movq 16(%rsp),%rdi + subq 0(%rcx),%r14 + movq %r15,%rdx + sbbq 8(%rcx),%r15 + movq %r8,%rbx + sbbq 16(%rcx),%r8 + movq %r9,%rsi + sbbq 24(%rcx),%r9 + movq %r10,%rbp + sbbq 32(%rcx),%r10 + movq %r11,%r13 + sbbq 40(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rdx,%r15 + cmovcq %rbx,%r8 + movq %r14,0(%rdi) + cmovcq %rsi,%r9 + movq %r15,8(%rdi) + cmovcq %rbp,%r10 + movq %r8,16(%rdi) + cmovcq %r13,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 + +.globl sqr_n_mul_mont_384 + +.def sqr_n_mul_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sqr_n_mul_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_n_mul_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + movq 48(%rsp),%r9 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqr_n_mul_mont_384: + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +.Loop_sqr_384: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movd %xmm1,%edx + leaq 0(%rdi),%rsi + decl %edx + jnz .Loop_sqr_384 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_n_mul_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_n_mul_mont_384: + +.globl sqr_n_mul_mont_383 + +.def sqr_n_mul_mont_383; .scl 2; .type 32; .endef +.p2align 5 +sqr_n_mul_mont_383: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_n_mul_mont_383: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + movq 48(%rsp),%r9 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqr_n_mul_mont_383: + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +.Loop_sqr_383: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + + movd %xmm1,%edx + addq 48(%rsi),%r14 + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + leaq 0(%rdi),%rsi + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + decl %edx + jnz .Loop_sqr_383 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 + + movq 
8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_n_mul_mont_383: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_n_mul_mont_383: +.def __mulq_mont_383_nonred; .scl 3; .type 32; .endef +.p2align 5 +__mulq_mont_383_nonred: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rbp + mulq %r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%r15 + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%r15 + movq %r8,%rax + adcq %rdx,%r15 + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r9 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rcx) + addq %r15,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %r15,%r13 + adcq %rdx,%r14 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + movq %r9,%r8 + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rsi) + addq %r15,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rcx) + addq %rax,%r8 + movq %r9,%rax + adcq %rdx,%r8 + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rcx) + addq %r8,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r8,%r14 + adcq %rdx,%r15 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r10,%r9 + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r10,%rax + adcq $0,%rdx + 
movq %rdx,%r8 + + mulq 0(%rcx) + addq %rax,%r9 + movq %r10,%rax + adcq %rdx,%r9 + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rcx) + addq %r9,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r9,%r15 + adcq %rdx,%r8 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r11,%r10 + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rcx) + addq %rax,%r10 + movq %r11,%rax + adcq %rdx,%r10 + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rcx) + addq %r10,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r10,%r8 + adcq %rdx,%r9 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r12,%r11 + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rcx) + addq %rax,%r11 + movq %r12,%rax + adcq %rdx,%r11 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rcx) + addq %r11,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r11,%r9 + adcq %rdx,%r10 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 
16(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r13,%r12 + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 0(%rcx) + addq %rax,%r12 + movq %r13,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 24(%rcx) + addq %r12,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r12,%r10 + adcq %rdx,%r11 + .byte 0xf3,0xc3 + +.globl sqr_mont_382x + +.def sqr_mont_382x; .scl 2; .type 32; .endef +.p2align 5 +sqr_mont_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_mont_382x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqr_mont_382x: + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rsi,16(%rsp) + movq %rdi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq 24(%rsp),%rdi + call __mulq_mont_383_nonred + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %r8,64(%rdi) + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_383_nonred + movq 32+96(%rsp),%rsi + movq 32+0(%rsp),%r12 + movq 32+8(%rsp),%r13 + andq %rsi,%r12 + movq 32+16(%rsp),%rax + andq %rsi,%r13 + movq 32+24(%rsp),%rbx + andq %rsi,%rax + movq 32+32(%rsp),%rbp + andq %rsi,%rbx + andq %rsi,%rbp + andq 32+40(%rsp),%rsi + + subq %r12,%r14 + movq 0(%rcx),%r12 + sbbq %r13,%r15 + movq 8(%rcx),%r13 + sbbq %rax,%r8 + movq 16(%rcx),%rax + sbbq %rbx,%r9 + movq 24(%rcx),%rbx + sbbq %rbp,%r10 + movq 32(%rcx),%rbp + sbbq %rsi,%r11 + sbbq %rsi,%rsi + + andq %rsi,%r12 + 
andq %rsi,%r13 + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r12,%r14 + adcq %r13,%r15 + adcq %rax,%r8 + adcq %rbx,%r9 + adcq %rbp,%r10 + adcq %rsi,%r11 + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_mont_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_mont_382x: +.section .pdata +.p2align 2 +.rva .LSEH_begin_mul_mont_384x +.rva .LSEH_body_mul_mont_384x +.rva .LSEH_info_mul_mont_384x_prologue + +.rva .LSEH_body_mul_mont_384x +.rva .LSEH_epilogue_mul_mont_384x +.rva .LSEH_info_mul_mont_384x_body + +.rva .LSEH_epilogue_mul_mont_384x +.rva .LSEH_end_mul_mont_384x +.rva .LSEH_info_mul_mont_384x_epilogue + +.rva .LSEH_begin_sqr_mont_384x +.rva .LSEH_body_sqr_mont_384x +.rva .LSEH_info_sqr_mont_384x_prologue + +.rva .LSEH_body_sqr_mont_384x +.rva .LSEH_epilogue_sqr_mont_384x +.rva .LSEH_info_sqr_mont_384x_body + +.rva .LSEH_epilogue_sqr_mont_384x +.rva .LSEH_end_sqr_mont_384x +.rva .LSEH_info_sqr_mont_384x_epilogue + +.rva .LSEH_begin_mul_382x +.rva .LSEH_body_mul_382x +.rva .LSEH_info_mul_382x_prologue + +.rva .LSEH_body_mul_382x +.rva .LSEH_epilogue_mul_382x +.rva .LSEH_info_mul_382x_body + +.rva .LSEH_epilogue_mul_382x +.rva .LSEH_end_mul_382x +.rva .LSEH_info_mul_382x_epilogue + +.rva .LSEH_begin_sqr_382x +.rva .LSEH_body_sqr_382x +.rva .LSEH_info_sqr_382x_prologue + +.rva .LSEH_body_sqr_382x +.rva .LSEH_epilogue_sqr_382x +.rva .LSEH_info_sqr_382x_body + +.rva .LSEH_epilogue_sqr_382x +.rva .LSEH_end_sqr_382x +.rva .LSEH_info_sqr_382x_epilogue + +.rva .LSEH_begin_mul_384 +.rva .LSEH_body_mul_384 +.rva .LSEH_info_mul_384_prologue + +.rva .LSEH_body_mul_384 +.rva .LSEH_epilogue_mul_384 +.rva .LSEH_info_mul_384_body + +.rva .LSEH_epilogue_mul_384 +.rva .LSEH_end_mul_384 +.rva .LSEH_info_mul_384_epilogue + +.rva .LSEH_begin_sqr_384 +.rva .LSEH_body_sqr_384 +.rva .LSEH_info_sqr_384_prologue + +.rva .LSEH_body_sqr_384 +.rva .LSEH_epilogue_sqr_384 +.rva .LSEH_info_sqr_384_body + +.rva .LSEH_epilogue_sqr_384 +.rva .LSEH_end_sqr_384 +.rva .LSEH_info_sqr_384_epilogue + +.rva .LSEH_begin_sqr_mont_384 +.rva .LSEH_body_sqr_mont_384 +.rva .LSEH_info_sqr_mont_384_prologue + +.rva .LSEH_body_sqr_mont_384 +.rva .LSEH_epilogue_sqr_mont_384 +.rva .LSEH_info_sqr_mont_384_body + +.rva .LSEH_epilogue_sqr_mont_384 +.rva .LSEH_end_sqr_mont_384 +.rva .LSEH_info_sqr_mont_384_epilogue + +.rva .LSEH_begin_redc_mont_384 +.rva .LSEH_body_redc_mont_384 +.rva .LSEH_info_redc_mont_384_prologue + +.rva .LSEH_body_redc_mont_384 +.rva .LSEH_epilogue_redc_mont_384 +.rva .LSEH_info_redc_mont_384_body + +.rva .LSEH_epilogue_redc_mont_384 +.rva .LSEH_end_redc_mont_384 +.rva .LSEH_info_redc_mont_384_epilogue + +.rva .LSEH_begin_from_mont_384 +.rva .LSEH_body_from_mont_384 +.rva .LSEH_info_from_mont_384_prologue + +.rva .LSEH_body_from_mont_384 +.rva .LSEH_epilogue_from_mont_384 +.rva .LSEH_info_from_mont_384_body + +.rva .LSEH_epilogue_from_mont_384 +.rva .LSEH_end_from_mont_384 +.rva .LSEH_info_from_mont_384_epilogue + +.rva .LSEH_begin_sgn0_pty_mont_384 +.rva .LSEH_body_sgn0_pty_mont_384 +.rva .LSEH_info_sgn0_pty_mont_384_prologue + +.rva .LSEH_body_sgn0_pty_mont_384 +.rva .LSEH_epilogue_sgn0_pty_mont_384 +.rva .LSEH_info_sgn0_pty_mont_384_body + +.rva .LSEH_epilogue_sgn0_pty_mont_384 +.rva 
.LSEH_end_sgn0_pty_mont_384 +.rva .LSEH_info_sgn0_pty_mont_384_epilogue + +.rva .LSEH_begin_sgn0_pty_mont_384x +.rva .LSEH_body_sgn0_pty_mont_384x +.rva .LSEH_info_sgn0_pty_mont_384x_prologue + +.rva .LSEH_body_sgn0_pty_mont_384x +.rva .LSEH_epilogue_sgn0_pty_mont_384x +.rva .LSEH_info_sgn0_pty_mont_384x_body + +.rva .LSEH_epilogue_sgn0_pty_mont_384x +.rva .LSEH_end_sgn0_pty_mont_384x +.rva .LSEH_info_sgn0_pty_mont_384x_epilogue + +.rva .LSEH_begin_mul_mont_384 +.rva .LSEH_body_mul_mont_384 +.rva .LSEH_info_mul_mont_384_prologue + +.rva .LSEH_body_mul_mont_384 +.rva .LSEH_epilogue_mul_mont_384 +.rva .LSEH_info_mul_mont_384_body + +.rva .LSEH_epilogue_mul_mont_384 +.rva .LSEH_end_mul_mont_384 +.rva .LSEH_info_mul_mont_384_epilogue + +.rva .LSEH_begin_sqr_n_mul_mont_384 +.rva .LSEH_body_sqr_n_mul_mont_384 +.rva .LSEH_info_sqr_n_mul_mont_384_prologue + +.rva .LSEH_body_sqr_n_mul_mont_384 +.rva .LSEH_epilogue_sqr_n_mul_mont_384 +.rva .LSEH_info_sqr_n_mul_mont_384_body + +.rva .LSEH_epilogue_sqr_n_mul_mont_384 +.rva .LSEH_end_sqr_n_mul_mont_384 +.rva .LSEH_info_sqr_n_mul_mont_384_epilogue + +.rva .LSEH_begin_sqr_n_mul_mont_383 +.rva .LSEH_body_sqr_n_mul_mont_383 +.rva .LSEH_info_sqr_n_mul_mont_383_prologue + +.rva .LSEH_body_sqr_n_mul_mont_383 +.rva .LSEH_epilogue_sqr_n_mul_mont_383 +.rva .LSEH_info_sqr_n_mul_mont_383_body + +.rva .LSEH_epilogue_sqr_n_mul_mont_383 +.rva .LSEH_end_sqr_n_mul_mont_383 +.rva .LSEH_info_sqr_n_mul_mont_383_epilogue + +.rva .LSEH_begin_sqr_mont_382x +.rva .LSEH_body_sqr_mont_382x +.rva .LSEH_info_sqr_mont_382x_prologue + +.rva .LSEH_body_sqr_mont_382x +.rva .LSEH_epilogue_sqr_mont_382x +.rva .LSEH_info_sqr_mont_382x_body + +.rva .LSEH_epilogue_sqr_mont_382x +.rva .LSEH_end_sqr_mont_382x +.rva .LSEH_info_sqr_mont_382x_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_mul_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_mont_384x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x29,0x00 +.byte 0x00,0xe4,0x2a,0x00 +.byte 0x00,0xd4,0x2b,0x00 +.byte 0x00,0xc4,0x2c,0x00 +.byte 0x00,0x34,0x2d,0x00 +.byte 0x00,0x54,0x2e,0x00 +.byte 0x00,0x74,0x30,0x00 +.byte 0x00,0x64,0x31,0x00 +.byte 0x00,0x01,0x2f,0x00 +.LSEH_info_mul_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_mont_384x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_sqr_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_382x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_mul_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_382x_body: 
+.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sqr_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_384_body: +.byte 1,0,11,0 +.byte 0x00,0xc4,0x00,0x00 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.LSEH_info_mul_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sqr_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_mont_384_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x0f,0x00 +.byte 0x00,0xe4,0x10,0x00 +.byte 0x00,0xd4,0x11,0x00 +.byte 0x00,0xc4,0x12,0x00 +.byte 0x00,0x34,0x13,0x00 +.byte 0x00,0x54,0x14,0x00 +.byte 0x00,0x74,0x16,0x00 +.byte 0x00,0x64,0x17,0x00 +.byte 0x00,0x01,0x15,0x00 +.LSEH_info_sqr_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_redc_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_redc_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_redc_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_from_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_from_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_from_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0_pty_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sgn0_pty_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sgn0_pty_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 
+.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0_pty_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sgn0_pty_mont_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sgn0_pty_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00 +.LSEH_info_mul_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_n_mul_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_n_mul_mont_384_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_sqr_n_mul_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_n_mul_mont_383_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_n_mul_mont_383_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_sqr_n_mul_mont_383_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_mont_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_mont_382x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_sqr_mont_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/mulx_mont_256-x86_64.s b/crypto/blst_src/build/coff/mulx_mont_256-x86_64.s new file mode 100644 index 00000000000..75c7e82bc1a --- /dev/null +++ b/crypto/blst_src/build/coff/mulx_mont_256-x86_64.s @@ -0,0 +1,784 @@ +.text + +.globl mulx_mont_sparse_256 + +.def mulx_mont_sparse_256; .scl 2; .type 32; .endef +.p2align 5 +mulx_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_mont_sparse_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_mulx_mont_sparse_256: + + + movq %rdx,%rbx + movq 
0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mulx_mont_sparse_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_mont_sparse_256: + +.globl sqrx_mont_sparse_256 + +.def sqrx_mont_sparse_256; .scl 2; .type 32; .endef +.p2align 5 +sqrx_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_mont_sparse_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sqrx_mont_sparse_256: + + + movq %rsi,%rbx + movq %rcx,%r8 + movq %rdx,%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqrx_mont_sparse_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_mont_sparse_256: +.def __mulx_mont_sparse_256; .scl 3; .type 32; .endef +.p2align 5 +__mulx_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + + mulxq %r15,%r15,%r12 + mulxq %rbp,%rbp,%r13 + addq %r15,%r11 + mulxq %r9,%r9,%r14 + movq 8(%rbx),%rdx + adcq %rbp,%r12 + adcq %r9,%r13 + adcq $0,%r14 + + movq %rax,%r10 + imulq %r8,%rax + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r11 + adcxq %r9,%r12 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r14 + adcxq %r15,%r9 + adoxq %r9,%r15 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r10 + adoxq %r11,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r12 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r12 + adoxq %r9,%r13 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 16(%rbx),%rdx + adcxq %rbp,%r13 + adoxq %r9,%r14 + adcxq %r10,%r14 + adoxq %r10,%r15 + adcxq %r10,%r15 + adoxq %r10,%r10 + adcq $0,%r10 + movq %rax,%r11 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r15 + adcxq %r10,%r9 + adoxq %r9,%r10 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r11 + adoxq %r12,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r13 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r13 + adoxq %r9,%r14 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 24(%rbx),%rdx + adcxq %rbp,%r14 + adoxq %r9,%r15 + adcxq %r11,%r15 + adoxq %r11,%r10 + adcxq %r11,%r10 + adoxq %r11,%r11 + adcq $0,%r11 + movq %rax,%r12 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r15 + adcxq %r9,%r10 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r10 + adcxq %r11,%r9 + adoxq 
%r9,%r11 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r12 + adoxq %r13,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r14 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %rax,%rdx + adcxq %rbp,%r15 + adoxq %r9,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + adoxq %r12,%r12 + adcq $0,%r12 + imulq %r8,%rdx + + + xorq %rbp,%rbp + mulxq 0+128(%rcx),%r13,%r9 + adcxq %rax,%r13 + adoxq %r9,%r14 + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r15 + adoxq %r9,%r10 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %r14,%rdx + leaq 128(%rcx),%rcx + adcxq %rbp,%r10 + adoxq %r9,%r11 + movq %r15,%rax + adcxq %r13,%r11 + adoxq %r13,%r12 + adcq $0,%r12 + + + + + movq %r10,%rbp + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + sbbq 16(%rcx),%r10 + movq %r11,%r9 + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rdx,%r14 + cmovcq %rax,%r15 + cmovcq %rbp,%r10 + movq %r14,0(%rdi) + cmovcq %r9,%r11 + movq %r15,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + .byte 0xf3,0xc3 + +.globl fromx_mont_256 + +.def fromx_mont_256; .scl 2; .type 32; .endef +.p2align 5 +fromx_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_fromx_mont_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_fromx_mont_256: + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + + + + + movq %r15,%rdx + movq %r10,%r12 + movq %r11,%r13 + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + sbbq 24(%rbx),%r11 + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_fromx_mont_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_fromx_mont_256: + +.globl redcx_mont_256 + +.def redcx_mont_256; .scl 2; .type 32; .endef +.p2align 5 +redcx_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_redcx_mont_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_redcx_mont_256: + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + addq 32(%rsi),%r14 + adcq 40(%rsi),%r15 + movq %r14,%rax + adcq 48(%rsi),%r10 + movq %r15,%rdx + adcq 56(%rsi),%r11 + sbbq %rsi,%rsi + + + + + movq %r10,%r12 + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + movq %r11,%r13 + sbbq 24(%rbx),%r11 + sbbq $0,%rsi + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_redcx_mont_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_redcx_mont_256: +.def __mulx_by_1_mont_256; .scl 3; .type 32; .endef +.p2align 5 +__mulx_by_1_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r11 + movq 16(%rsi),%r12 + movq 
24(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r10 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r10 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + movq %r13,%r11 + imulq %rcx,%r13 + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_mulx_mont_sparse_256 +.rva .LSEH_body_mulx_mont_sparse_256 +.rva .LSEH_info_mulx_mont_sparse_256_prologue + +.rva .LSEH_body_mulx_mont_sparse_256 +.rva .LSEH_epilogue_mulx_mont_sparse_256 +.rva .LSEH_info_mulx_mont_sparse_256_body + +.rva .LSEH_epilogue_mulx_mont_sparse_256 +.rva .LSEH_end_mulx_mont_sparse_256 +.rva .LSEH_info_mulx_mont_sparse_256_epilogue + +.rva .LSEH_begin_sqrx_mont_sparse_256 +.rva .LSEH_body_sqrx_mont_sparse_256 +.rva .LSEH_info_sqrx_mont_sparse_256_prologue + +.rva .LSEH_body_sqrx_mont_sparse_256 +.rva .LSEH_epilogue_sqrx_mont_sparse_256 +.rva .LSEH_info_sqrx_mont_sparse_256_body + +.rva .LSEH_epilogue_sqrx_mont_sparse_256 +.rva .LSEH_end_sqrx_mont_sparse_256 +.rva .LSEH_info_sqrx_mont_sparse_256_epilogue + +.rva .LSEH_begin_fromx_mont_256 +.rva .LSEH_body_fromx_mont_256 +.rva .LSEH_info_fromx_mont_256_prologue + +.rva .LSEH_body_fromx_mont_256 +.rva .LSEH_epilogue_fromx_mont_256 +.rva .LSEH_info_fromx_mont_256_body + +.rva .LSEH_epilogue_fromx_mont_256 +.rva .LSEH_end_fromx_mont_256 +.rva .LSEH_info_fromx_mont_256_epilogue + +.rva .LSEH_begin_redcx_mont_256 +.rva .LSEH_body_redcx_mont_256 +.rva .LSEH_info_redcx_mont_256_prologue + +.rva .LSEH_body_redcx_mont_256 +.rva .LSEH_epilogue_redcx_mont_256 +.rva .LSEH_info_redcx_mont_256_body + +.rva .LSEH_epilogue_redcx_mont_256 +.rva .LSEH_end_redcx_mont_256 +.rva .LSEH_info_redcx_mont_256_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_mulx_mont_sparse_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mulx_mont_sparse_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 
+.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mulx_mont_sparse_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_mont_sparse_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_mont_sparse_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sqrx_mont_sparse_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_fromx_mont_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_fromx_mont_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_fromx_mont_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_redcx_mont_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_redcx_mont_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_redcx_mont_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/mulx_mont_384-x86_64.s b/crypto/blst_src/build/coff/mulx_mont_384-x86_64.s new file mode 100644 index 00000000000..12306a7ff5c --- /dev/null +++ b/crypto/blst_src/build/coff/mulx_mont_384-x86_64.s @@ -0,0 +1,3559 @@ +.text + + + + + + + +.def __sub_mod_384x384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 + + +.def __add_mod_384; .scl 3; .type 32; .endef 
+.p2align 5 +__add_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + + +.def __sub_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + +.globl mulx_mont_384x + +.def mulx_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +mulx_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_mont_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $328,%rsp + +.LSEH_body_mulx_mont_384x: + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulx_384 + + + leaq 48(%rbx),%rbx + leaq 128+48(%rsi),%rsi + leaq 96(%rdi),%rdi + call __mulx_384 + + + movq 8(%rsp),%rcx + leaq (%rbx),%rsi + leaq -48(%rbx),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulx_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + leaq (%rcx),%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_mulx_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_mont_384x: +.globl sqrx_mont_384x + +.def sqrx_mont_384x; .scl 2; .type 32; .endef +.p2align 5 
+sqrx_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_mont_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqrx_mont_384x: + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + movq %rdx,%r8 + adcq %r12,%r12 + movq %r15,%r9 + adcq %rdi,%rdi + movq %rax,%r10 + adcq %rbp,%rbp + movq %r12,%r11 + sbbq %rsi,%rsi + + subq 0(%rcx),%rdx + sbbq 8(%rcx),%r15 + movq %rdi,%r13 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r12 + sbbq 32(%rcx),%rdi + movq %rbp,%r14 + sbbq 40(%rcx),%rbp + sbbq $0,%rsi + + cmovcq %r8,%rdx + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %rdx,48(%rbx) + cmovcq %r11,%r12 + movq %r15,56(%rbx) + cmovcq %r13,%rdi + movq %rax,64(%rbx) + cmovcq %r14,%rbp + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqrx_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_mont_384x: + +.globl mulx_382x + +.def mulx_382x; .scl 2; .type 32; .endef +.p2align 5 +mulx_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_382x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_mulx_382x: + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulx_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulx_384 + + + leaq 48+128(%rsi),%rsi + leaq 
48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulx_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_mulx_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_382x: +.globl sqrx_382x + +.def sqrx_382x; .scl 2; .type 32; .endef +.p2align 5 +sqrx_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_382x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_sqrx_382x: + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulx_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulx_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqrx_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_382x: +.globl mulx_384 + +.def mulx_384; .scl 2; .type 32; .endef +.p2align 5 +mulx_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +.LSEH_body_mulx_384: + + + movq %rdx,%rbx + call __mulx_384 + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +.LSEH_epilogue_mulx_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_384: + +.def __mulx_384; .scl 3; .type 32; .endef +.p2align 5 +__mulx_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 
16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + leaq -128(%rsi),%rsi + + mulxq %r14,%r9,%rcx + xorq %rbp,%rbp + + mulxq %r15,%r8,%rax + adcxq %rcx,%r8 + movq %r9,0(%rdi) + + mulxq %r10,%r9,%rcx + adcxq %rax,%r9 + + mulxq %r11,%r10,%rax + adcxq %rcx,%r10 + + mulxq %r12,%r11,%rcx + adcxq %rax,%r11 + + mulxq %r13,%r12,%r13 + movq 8(%rbx),%rdx + adcxq %rcx,%r12 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,8(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 16(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,16(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 24(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,24(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 32(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,32(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 40(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,40(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq %rax,%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + movq %r10,64(%rdi) + movq %r11,72(%rdi) + movq %r12,80(%rdi) + movq %r13,88(%rdi) + + .byte 0xf3,0xc3 + +.globl sqrx_384 + +.def sqrx_384; .scl 2; .type 32; .endef +.p2align 5 +sqrx_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_384: + movq %rcx,%rdi + movq %rdx,%rsi + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_sqrx_384: + + + call __sqrx_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqrx_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_384: +.def __sqrx_384; .scl 3; .type 32; .endef +.p2align 5 +__sqrx_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 
0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%rcx + movq 32(%rsi),%rbx + + + mulxq %r14,%r8,%rdi + movq 40(%rsi),%rbp + mulxq %r15,%r9,%rax + addq %rdi,%r9 + mulxq %rcx,%r10,%rdi + adcq %rax,%r10 + mulxq %rbx,%r11,%rax + adcq %rdi,%r11 + mulxq %rbp,%r12,%r13 + movq %r14,%rdx + adcq %rax,%r12 + adcq $0,%r13 + + + xorq %r14,%r14 + mulxq %r15,%rdi,%rax + adcxq %rdi,%r10 + adoxq %rax,%r11 + + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r11 + adoxq %rax,%r12 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbp,%rdi,%rax + movq %r15,%rdx + adcxq %rdi,%r13 + adoxq %r14,%rax + adcxq %rax,%r14 + + + xorq %r15,%r15 + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r13 + adoxq %rax,%r14 + + mulxq %rbp,%rdi,%rax + movq %rcx,%rdx + adcxq %rdi,%r14 + adoxq %r15,%rax + adcxq %rax,%r15 + + + xorq %rcx,%rcx + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r14 + adoxq %rax,%r15 + + mulxq %rbp,%rdi,%rax + movq %rbx,%rdx + adcxq %rdi,%r15 + adoxq %rcx,%rax + adcxq %rax,%rcx + + + mulxq %rbp,%rdi,%rbx + movq 0(%rsi),%rdx + addq %rdi,%rcx + movq 8(%rsp),%rdi + adcq $0,%rbx + + + xorq %rbp,%rbp + adcxq %r8,%r8 + adcxq %r9,%r9 + adcxq %r10,%r10 + adcxq %r11,%r11 + adcxq %r12,%r12 + + + mulxq %rdx,%rdx,%rax + movq %rdx,0(%rdi) + movq 8(%rsi),%rdx + adoxq %rax,%r8 + movq %r8,8(%rdi) + + mulxq %rdx,%r8,%rax + movq 16(%rsi),%rdx + adoxq %r8,%r9 + adoxq %rax,%r10 + movq %r9,16(%rdi) + movq %r10,24(%rdi) + + mulxq %rdx,%r8,%r9 + movq 24(%rsi),%rdx + adoxq %r8,%r11 + adoxq %r9,%r12 + adcxq %r13,%r13 + adcxq %r14,%r14 + movq %r11,32(%rdi) + movq %r12,40(%rdi) + + mulxq %rdx,%r8,%r9 + movq 32(%rsi),%rdx + adoxq %r8,%r13 + adoxq %r9,%r14 + adcxq %r15,%r15 + adcxq %rcx,%rcx + movq %r13,48(%rdi) + movq %r14,56(%rdi) + + mulxq %rdx,%r8,%r9 + movq 40(%rsi),%rdx + adoxq %r8,%r15 + adoxq %r9,%rcx + adcxq %rbx,%rbx + adcxq %rbp,%rbp + movq %r15,64(%rdi) + movq %rcx,72(%rdi) + + mulxq %rdx,%r8,%r9 + adoxq %r8,%rbx + adoxq %r9,%rbp + + movq %rbx,80(%rdi) + movq %rbp,88(%rdi) + + .byte 0xf3,0xc3 + + + + +.globl redcx_mont_384 + +.def redcx_mont_384; .scl 2; .type 32; .endef +.p2align 5 +redcx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_redcx_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_redcx_mont_384: + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_redcx_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_redcx_mont_384: + + + + +.globl fromx_mont_384 + +.def fromx_mont_384; .scl 2; .type 32; .endef +.p2align 5 +fromx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_fromx_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_fromx_mont_384: + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + + + + + movq %r14,%rax + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 
40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_fromx_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_fromx_mont_384: +.def __mulx_by_1_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__mulx_by_1_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq %rcx,%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + imulq %r8,%rdx + + + xorq %r14,%r14 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r13 + adoxq %r14,%rbp + adcxq %rbp,%r14 + imulq %r9,%rdx + + + xorq %r15,%r15 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r14 + adoxq %r15,%rbp + adcxq %rbp,%r15 + imulq %r10,%rdx + + + xorq %r8,%r8 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r15 + adoxq %r8,%rbp + adcxq %rbp,%r8 + imulq %r11,%rdx + + + xorq %r9,%r9 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r8 + adoxq %r9,%rbp + adcxq %rbp,%r9 + imulq %r12,%rdx + + + xorq %r10,%r10 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r9 + adoxq %r10,%rbp + adcxq %rbp,%r10 + imulq %r13,%rdx + + + xorq %r11,%r11 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r10 + adoxq %r11,%rbp + adcxq %rbp,%r11 + .byte 0xf3,0xc3 + + +.def 
__redc_tail_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__redc_tail_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 + + +.globl sgn0x_pty_mont_384 + +.def sgn0x_pty_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sgn0x_pty_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0x_pty_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sgn0x_pty_mont_384: + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sgn0x_pty_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0x_pty_mont_384: + +.globl sgn0x_pty_mont_384x + +.def sgn0x_pty_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +sgn0x_pty_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0x_pty_mont_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sgn0x_pty_mont_384x: + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 
24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sgn0x_pty_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0x_pty_mont_384x: +.globl mulx_mont_384 + +.def mulx_mont_384; .scl 2; .type 32; .endef +.p2align 5 +mulx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + leaq -24(%rsp),%rsp + +.LSEH_body_mulx_mont_384: + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + movq %r8,(%rsp) + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 + + movq 32(%rsp),%r14 + + movq 40(%rsp),%r13 + + movq 48(%rsp),%r12 + + movq 56(%rsp),%rbx + + movq 64(%rsp),%rbp + + leaq 72(%rsp),%rsp + +.LSEH_epilogue_mulx_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_mont_384: +.def __mulx_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__mulx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + xorq %r15,%r15 + + movq %r8,16(%rsp) + imulq 8(%rsp),%r8 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %rbp,%r15 + adoxq %rax,%r15 + adoxq %rax,%rax + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %r8,%r14 + adoxq %r8,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r9,16(%rsp) + imulq 8(%rsp),%r9 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rbp,%rax + adoxq %r8,%rax + adoxq %r8,%r8 + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq 
%rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r9,%r15 + adoxq %r9,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r10,16(%rsp) + imulq 8(%rsp),%r10 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %rbp,%r8 + adoxq %r9,%r8 + adoxq %r9,%r9 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r10,%rax + adoxq %r10,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r11,16(%rsp) + imulq 8(%rsp),%r11 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %rbp,%r9 + adoxq %r10,%r9 + adoxq %r10,%r10 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r11,%r8 + adoxq %r11,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + movq %r12,16(%rsp) + imulq 8(%rsp),%r12 + + + xorq %r11,%r11 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %rbp,%r10 + adoxq %r11,%r10 + adoxq %r11,%r11 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r12,%r9 + adoxq %r12,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq 
%rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + movq %r15,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + movq %rax,%rsi + + mulxq 40+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + movq %r14,%rdx + adcxq %r12,%r10 + adoxq %r12,%r11 + leaq 128(%rcx),%rcx + movq %r8,%r12 + adcq $0,%r11 + + + + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r9,%rdi + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r8 + sbbq 32(%rcx),%r9 + movq %r10,%rbp + sbbq 40(%rcx),%r10 + sbbq $0,%r11 + + cmovncq %r14,%rdx + cmovcq %r13,%r15 + cmovcq %rsi,%rax + cmovncq %r8,%r12 + movq %rdx,0(%rbx) + cmovncq %r9,%rdi + movq %r15,8(%rbx) + cmovncq %r10,%rbp + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + + .byte 0xf3,0xc3 + + +.globl sqrx_mont_384 + +.def sqrx_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sqrx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + leaq -24(%rsp),%rsp + +.LSEH_body_sqrx_mont_384: + + + movq %rcx,%r8 + leaq -128(%rdx),%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + leaq (%rsi),%rbx + movq %r8,(%rsp) + leaq -128(%rsi),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 + + movq 32(%rsp),%r14 + + movq 40(%rsp),%r13 + + movq 48(%rsp),%r12 + + movq 56(%rsp),%rbx + + movq 64(%rsp),%rbp + + leaq 72(%rsp),%rsp + +.LSEH_epilogue_sqrx_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_mont_384: + +.globl sqrx_n_mul_mont_384 + +.def sqrx_n_mul_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sqrx_n_mul_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_n_mul_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + movq 48(%rsp),%r9 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + leaq -40(%rsp),%rsp + +.LSEH_body_sqrx_n_mul_mont_384: + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + +.Loop_sqrx_384: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movd %xmm1,%r10d + decl %r10d + jnz .Loop_sqrx_384 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 + + movq 48(%rsp),%r14 + + movq 56(%rsp),%r13 + + movq 64(%rsp),%r12 + + movq 72(%rsp),%rbx + + movq 80(%rsp),%rbp + + leaq 88(%rsp),%rsp + +.LSEH_epilogue_sqrx_n_mul_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_n_mul_mont_384: + +.globl sqrx_n_mul_mont_383 + +.def sqrx_n_mul_mont_383; .scl 2; .type 32; .endef +.p2align 5 +sqrx_n_mul_mont_383: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_n_mul_mont_383: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 
40(%rsp),%r8 + movq 48(%rsp),%r9 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + leaq -40(%rsp),%rsp + +.LSEH_body_sqrx_n_mul_mont_383: + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + leaq -128(%rcx),%rcx + +.Loop_sqrx_383: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_383_nonred + + movd %xmm1,%r10d + decl %r10d + jnz .Loop_sqrx_383 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 + + movq 48(%rsp),%r14 + + movq 56(%rsp),%r13 + + movq 64(%rsp),%r12 + + movq 72(%rsp),%rbx + + movq 80(%rsp),%rbp + + leaq 88(%rsp),%rsp + +.LSEH_epilogue_sqrx_n_mul_mont_383: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_n_mul_mont_383: +.def __mulx_mont_383_nonred; .scl 3; .type 32; .endef +.p2align 5 +__mulx_mont_383_nonred: + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + movq %r8,%rax + imulq 8(%rsp),%r8 + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %r15,%rbp + adoxq %rbp,%r15 + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %rax,%r14 + adoxq %rax,%r15 + adcxq %rax,%r15 + movq %r9,%r8 + imulq 8(%rsp),%r9 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rax,%rbp + adoxq %rbp,%rax + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r10,%r9 + imulq 8(%rsp),%r10 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 
8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %r8,%rbp + adoxq %rbp,%r8 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r11,%r10 + imulq 8(%rsp),%r11 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %r9,%rbp + adoxq %rbp,%r9 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r12,%r11 + imulq 8(%rsp),%r12 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %r10,%rbp + adoxq %rbp,%r10 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r14,%rdx + adcxq %rdi,%r9 + adoxq %rbp,%r10 + adcq $0,%r10 + movq %r8,%r12 + + movq %r14,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r9,%rdi + movq %r8,24(%rbx) + movq %r9,32(%rbx) + movq %r10,40(%rbx) + movq %r10,%rbp + + .byte 
0xf3,0xc3 + + +.globl sqrx_mont_382x + +.def sqrx_mont_382x; .scl 2; .type 32; .endef +.p2align 5 +sqrx_mont_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_mont_382x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqrx_mont_382x: + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + adcq %r12,%r12 + adcq %rdi,%rdi + adcq %rbp,%rbp + + movq %rdx,48(%rbx) + movq %r15,56(%rbx) + movq %rax,64(%rbx) + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32-128(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + + + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + movq 32+96(%rsp),%r14 + leaq 128(%rcx),%rcx + movq 32+0(%rsp),%r8 + andq %r14,%r8 + movq 32+8(%rsp),%r9 + andq %r14,%r9 + movq 32+16(%rsp),%r10 + andq %r14,%r10 + movq 32+24(%rsp),%r11 + andq %r14,%r11 + movq 32+32(%rsp),%r13 + andq %r14,%r13 + andq 32+40(%rsp),%r14 + + subq %r8,%rdx + movq 0(%rcx),%r8 + sbbq %r9,%r15 + movq 8(%rcx),%r9 + sbbq %r10,%rax + movq 16(%rcx),%r10 + sbbq %r11,%r12 + movq 24(%rcx),%r11 + sbbq %r13,%rdi + movq 32(%rcx),%r13 + sbbq %r14,%rbp + sbbq %r14,%r14 + + andq %r14,%r8 + andq %r14,%r9 + andq %r14,%r10 + andq %r14,%r11 + andq %r14,%r13 + andq 40(%rcx),%r14 + + addq %r8,%rdx + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%r12 + adcq %r13,%rdi + adcq %r14,%rbp + + movq %rdx,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqrx_mont_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_mont_382x: +.section .pdata +.p2align 2 +.rva .LSEH_begin_mulx_mont_384x +.rva .LSEH_body_mulx_mont_384x +.rva .LSEH_info_mulx_mont_384x_prologue + +.rva .LSEH_body_mulx_mont_384x +.rva .LSEH_epilogue_mulx_mont_384x +.rva .LSEH_info_mulx_mont_384x_body + +.rva .LSEH_epilogue_mulx_mont_384x +.rva .LSEH_end_mulx_mont_384x +.rva 
.LSEH_info_mulx_mont_384x_epilogue + +.rva .LSEH_begin_sqrx_mont_384x +.rva .LSEH_body_sqrx_mont_384x +.rva .LSEH_info_sqrx_mont_384x_prologue + +.rva .LSEH_body_sqrx_mont_384x +.rva .LSEH_epilogue_sqrx_mont_384x +.rva .LSEH_info_sqrx_mont_384x_body + +.rva .LSEH_epilogue_sqrx_mont_384x +.rva .LSEH_end_sqrx_mont_384x +.rva .LSEH_info_sqrx_mont_384x_epilogue + +.rva .LSEH_begin_mulx_382x +.rva .LSEH_body_mulx_382x +.rva .LSEH_info_mulx_382x_prologue + +.rva .LSEH_body_mulx_382x +.rva .LSEH_epilogue_mulx_382x +.rva .LSEH_info_mulx_382x_body + +.rva .LSEH_epilogue_mulx_382x +.rva .LSEH_end_mulx_382x +.rva .LSEH_info_mulx_382x_epilogue + +.rva .LSEH_begin_sqrx_382x +.rva .LSEH_body_sqrx_382x +.rva .LSEH_info_sqrx_382x_prologue + +.rva .LSEH_body_sqrx_382x +.rva .LSEH_epilogue_sqrx_382x +.rva .LSEH_info_sqrx_382x_body + +.rva .LSEH_epilogue_sqrx_382x +.rva .LSEH_end_sqrx_382x +.rva .LSEH_info_sqrx_382x_epilogue + +.rva .LSEH_begin_mulx_384 +.rva .LSEH_body_mulx_384 +.rva .LSEH_info_mulx_384_prologue + +.rva .LSEH_body_mulx_384 +.rva .LSEH_epilogue_mulx_384 +.rva .LSEH_info_mulx_384_body + +.rva .LSEH_epilogue_mulx_384 +.rva .LSEH_end_mulx_384 +.rva .LSEH_info_mulx_384_epilogue + +.rva .LSEH_begin_sqrx_384 +.rva .LSEH_body_sqrx_384 +.rva .LSEH_info_sqrx_384_prologue + +.rva .LSEH_body_sqrx_384 +.rva .LSEH_epilogue_sqrx_384 +.rva .LSEH_info_sqrx_384_body + +.rva .LSEH_epilogue_sqrx_384 +.rva .LSEH_end_sqrx_384 +.rva .LSEH_info_sqrx_384_epilogue + +.rva .LSEH_begin_redcx_mont_384 +.rva .LSEH_body_redcx_mont_384 +.rva .LSEH_info_redcx_mont_384_prologue + +.rva .LSEH_body_redcx_mont_384 +.rva .LSEH_epilogue_redcx_mont_384 +.rva .LSEH_info_redcx_mont_384_body + +.rva .LSEH_epilogue_redcx_mont_384 +.rva .LSEH_end_redcx_mont_384 +.rva .LSEH_info_redcx_mont_384_epilogue + +.rva .LSEH_begin_fromx_mont_384 +.rva .LSEH_body_fromx_mont_384 +.rva .LSEH_info_fromx_mont_384_prologue + +.rva .LSEH_body_fromx_mont_384 +.rva .LSEH_epilogue_fromx_mont_384 +.rva .LSEH_info_fromx_mont_384_body + +.rva .LSEH_epilogue_fromx_mont_384 +.rva .LSEH_end_fromx_mont_384 +.rva .LSEH_info_fromx_mont_384_epilogue + +.rva .LSEH_begin_sgn0x_pty_mont_384 +.rva .LSEH_body_sgn0x_pty_mont_384 +.rva .LSEH_info_sgn0x_pty_mont_384_prologue + +.rva .LSEH_body_sgn0x_pty_mont_384 +.rva .LSEH_epilogue_sgn0x_pty_mont_384 +.rva .LSEH_info_sgn0x_pty_mont_384_body + +.rva .LSEH_epilogue_sgn0x_pty_mont_384 +.rva .LSEH_end_sgn0x_pty_mont_384 +.rva .LSEH_info_sgn0x_pty_mont_384_epilogue + +.rva .LSEH_begin_sgn0x_pty_mont_384x +.rva .LSEH_body_sgn0x_pty_mont_384x +.rva .LSEH_info_sgn0x_pty_mont_384x_prologue + +.rva .LSEH_body_sgn0x_pty_mont_384x +.rva .LSEH_epilogue_sgn0x_pty_mont_384x +.rva .LSEH_info_sgn0x_pty_mont_384x_body + +.rva .LSEH_epilogue_sgn0x_pty_mont_384x +.rva .LSEH_end_sgn0x_pty_mont_384x +.rva .LSEH_info_sgn0x_pty_mont_384x_epilogue + +.rva .LSEH_begin_mulx_mont_384 +.rva .LSEH_body_mulx_mont_384 +.rva .LSEH_info_mulx_mont_384_prologue + +.rva .LSEH_body_mulx_mont_384 +.rva .LSEH_epilogue_mulx_mont_384 +.rva .LSEH_info_mulx_mont_384_body + +.rva .LSEH_epilogue_mulx_mont_384 +.rva .LSEH_end_mulx_mont_384 +.rva .LSEH_info_mulx_mont_384_epilogue + +.rva .LSEH_begin_sqrx_mont_384 +.rva .LSEH_body_sqrx_mont_384 +.rva .LSEH_info_sqrx_mont_384_prologue + +.rva .LSEH_body_sqrx_mont_384 +.rva .LSEH_epilogue_sqrx_mont_384 +.rva .LSEH_info_sqrx_mont_384_body + +.rva .LSEH_epilogue_sqrx_mont_384 +.rva .LSEH_end_sqrx_mont_384 +.rva .LSEH_info_sqrx_mont_384_epilogue + +.rva .LSEH_begin_sqrx_n_mul_mont_384 +.rva .LSEH_body_sqrx_n_mul_mont_384 
+.rva .LSEH_info_sqrx_n_mul_mont_384_prologue + +.rva .LSEH_body_sqrx_n_mul_mont_384 +.rva .LSEH_epilogue_sqrx_n_mul_mont_384 +.rva .LSEH_info_sqrx_n_mul_mont_384_body + +.rva .LSEH_epilogue_sqrx_n_mul_mont_384 +.rva .LSEH_end_sqrx_n_mul_mont_384 +.rva .LSEH_info_sqrx_n_mul_mont_384_epilogue + +.rva .LSEH_begin_sqrx_n_mul_mont_383 +.rva .LSEH_body_sqrx_n_mul_mont_383 +.rva .LSEH_info_sqrx_n_mul_mont_383_prologue + +.rva .LSEH_body_sqrx_n_mul_mont_383 +.rva .LSEH_epilogue_sqrx_n_mul_mont_383 +.rva .LSEH_info_sqrx_n_mul_mont_383_body + +.rva .LSEH_epilogue_sqrx_n_mul_mont_383 +.rva .LSEH_end_sqrx_n_mul_mont_383 +.rva .LSEH_info_sqrx_n_mul_mont_383_epilogue + +.rva .LSEH_begin_sqrx_mont_382x +.rva .LSEH_body_sqrx_mont_382x +.rva .LSEH_info_sqrx_mont_382x_prologue + +.rva .LSEH_body_sqrx_mont_382x +.rva .LSEH_epilogue_sqrx_mont_382x +.rva .LSEH_info_sqrx_mont_382x_body + +.rva .LSEH_epilogue_sqrx_mont_382x +.rva .LSEH_end_sqrx_mont_382x +.rva .LSEH_info_sqrx_mont_382x_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_mulx_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mulx_mont_384x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x29,0x00 +.byte 0x00,0xe4,0x2a,0x00 +.byte 0x00,0xd4,0x2b,0x00 +.byte 0x00,0xc4,0x2c,0x00 +.byte 0x00,0x34,0x2d,0x00 +.byte 0x00,0x54,0x2e,0x00 +.byte 0x00,0x74,0x30,0x00 +.byte 0x00,0x64,0x31,0x00 +.byte 0x00,0x01,0x2f,0x00 +.LSEH_info_mulx_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_mont_384x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_sqrx_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mulx_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mulx_382x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_mulx_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_382x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sqrx_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mulx_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mulx_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x00,0x00 +.byte 0x00,0xe4,0x01,0x00 +.byte 0x00,0xd4,0x02,0x00 +.byte 0x00,0xc4,0x03,0x00 +.byte 0x00,0x34,0x04,0x00 +.byte 0x00,0x54,0x05,0x00 +.byte 0x00,0x74,0x07,0x00 +.byte 0x00,0x64,0x08,0x00 +.byte 0x00,0x52 
+.byte 0x00,0x00 +.LSEH_info_mulx_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sqrx_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_redcx_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_redcx_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_redcx_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_fromx_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_fromx_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_fromx_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0x_pty_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sgn0x_pty_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sgn0x_pty_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0x_pty_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sgn0x_pty_mont_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sgn0x_pty_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mulx_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mulx_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00 +.LSEH_info_mulx_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 
0,0x03 +.byte 0,0 +.LSEH_info_sqrx_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00 +.LSEH_info_sqrx_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_n_mul_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_n_mul_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x05,0x00 +.byte 0x00,0xe4,0x06,0x00 +.byte 0x00,0xd4,0x07,0x00 +.byte 0x00,0xc4,0x08,0x00 +.byte 0x00,0x34,0x09,0x00 +.byte 0x00,0x54,0x0a,0x00 +.byte 0x00,0x74,0x0c,0x00 +.byte 0x00,0x64,0x0d,0x00 +.byte 0x00,0xa2 +.byte 0x00,0x00 +.LSEH_info_sqrx_n_mul_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_n_mul_mont_383_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_n_mul_mont_383_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x05,0x00 +.byte 0x00,0xe4,0x06,0x00 +.byte 0x00,0xd4,0x07,0x00 +.byte 0x00,0xc4,0x08,0x00 +.byte 0x00,0x34,0x09,0x00 +.byte 0x00,0x54,0x0a,0x00 +.byte 0x00,0x74,0x0c,0x00 +.byte 0x00,0x64,0x0d,0x00 +.byte 0x00,0xa2 +.byte 0x00,0x00 +.LSEH_info_sqrx_n_mul_mont_383_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_mont_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_mont_382x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_sqrx_mont_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/sha256-armv8.S b/crypto/blst_src/build/coff/sha256-armv8.S new file mode 100644 index 00000000000..a8bcbd3631b --- /dev/null +++ b/crypto/blst_src/build/coff/sha256-armv8.S @@ -0,0 +1,1087 @@ +// +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// ==================================================================== +// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +// project. +// ==================================================================== +// +// sha256_block procedure for ARMv8. +// +// This module is stripped of scalar code paths, with the rationale that all +// known processors are NEON-capable. +// +// See original module at CRYPTOGAMS for further details.
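+//
+// Calling convention (an assumption inferred from the register usage in the
+// routines below, not something stated in this file): both entry points take
+// the 8-word hash state in x0, the input pointer in x1 and the number of
+// 64-byte blocks in x2, i.e. they are expected to be driven from C through a
+// prototype along the lines of
+//
+//   void blst_sha256_block_data_order(unsigned int h[8],
+//                                     const void *inp, size_t blocks);
+//
+// so a hypothetical caller compresses whole blocks only, e.g.
+//
+//   blst_sha256_block_data_order(h, buf, len / 64);
+//
+// and performs the final-block padding itself before the last call.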
+ +.text + +.p2align 6 + +.LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.align 2 +.p2align 2 +.globl blst_sha256_block_armv8 +.def blst_sha256_block_armv8; +.type 32; +.endef +.p2align 6 +blst_sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adr x3,.LK256 + +.Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b 
+ orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,.Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret + +.globl blst_sha256_block_data_order +.def blst_sha256_block_data_order; +.type 32; +.endef +.p2align 4 +blst_sha256_block_data_order: + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + sub sp,sp,#16*4 + + adr x16,.LK256 + add x2,x1,x2,lsl#6 // len to point at the end of inp + + ld1 {v0.16b},[x1], #16 + ld1 {v1.16b},[x1], #16 + ld1 {v2.16b},[x1], #16 + ld1 {v3.16b},[x1], #16 + ld1 {v4.4s},[x16], #16 + ld1 {v5.4s},[x16], #16 + ld1 {v6.4s},[x16], #16 + ld1 {v7.4s},[x16], #16 + rev32 v0.16b,v0.16b // yes, even on + rev32 v1.16b,v1.16b // big-endian + rev32 v2.16b,v2.16b + rev32 v3.16b,v3.16b + mov x17,sp + add v4.4s,v4.4s,v0.4s + add v5.4s,v5.4s,v1.4s + add v6.4s,v6.4s,v2.4s + st1 {v4.4s,v5.4s},[x17], #32 + add v7.4s,v7.4s,v3.4s + st1 {v6.4s,v7.4s},[x17] + sub x17,x17,#32 + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#8] + ldp w7,w8,[x0,#16] + ldp w9,w10,[x0,#24] + ldr w12,[sp,#0] + mov w13,wzr + eor w14,w4,w5 + mov w15,wzr + b .L_00_48 + +.p2align 4 +.L_00_48: + ext v4.16b,v0.16b,v1.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v2.16b,v3.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v3.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v0.4s,v0.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v0.4s,v0.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v0.4s,v0.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v0.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v0.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v0.4s,#15 + add w8,w8,w12 + ushr v17.4s,v0.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v0.4s,#13 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v0.4s,v0.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v0.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v1.16b,v2.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v3.16b,v0.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v0.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v1.4s,v1.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli 
v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v1.4s,v1.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v1.4s,v1.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v1.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v1.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v1.4s,#15 + add w4,w4,w12 + ushr v17.4s,v1.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v1.4s,#13 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v1.4s,v1.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v1.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + ext v4.16b,v2.16b,v3.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v0.16b,v1.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v1.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v2.4s,v2.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v2.4s,v2.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v2.4s,v2.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v2.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v2.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v2.4s,#15 + add w8,w8,w12 + ushr v17.4s,v2.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v2.4s,#13 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + 
add v2.4s,v2.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v2.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v3.16b,v0.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v1.16b,v2.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v2.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v3.4s,v3.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v3.4s,v3.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v3.4s,v3.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v3.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v3.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v3.4s,#15 + add w4,w4,w12 + ushr v17.4s,v3.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v3.4s,#13 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v3.4s,v3.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v3.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[x16] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + cmp w12,#0 // check for K256 terminator + ldr w12,[sp,#0] + sub x17,x17,#64 + bne .L_00_48 + + sub x16,x16,#256 // rewind x16 + cmp x1,x2 + mov x17, #64 + csel x17, x17, xzr, eq + sub x1,x1,x17 // avoid SEGV + mov x17,sp + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v0.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v0.16b,v0.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v0.4s + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add 
w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v1.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v1.16b,v1.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v1.4s + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v2.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v2.16b,v2.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v2.4s + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor 
w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v3.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v3.16b,v3.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v3.4s + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w3,w3,w15 // h+=Sigma0(a) from the past + ldp w11,w12,[x0,#0] + add w3,w3,w13 // h+=Maj(a,b,c) from the past + ldp w13,w14,[x0,#8] + add w3,w3,w11 // accumulate + add w4,w4,w12 + ldp w11,w12,[x0,#16] + add w5,w5,w13 + add w6,w6,w14 + ldp w13,w14,[x0,#24] + add w7,w7,w11 + add w8,w8,w12 + ldr w12,[sp,#0] + stp w3,w4,[x0,#0] + add w9,w9,w13 + mov w13,wzr + stp w5,w6,[x0,#8] + add w10,w10,w14 + stp w7,w8,[x0,#16] + eor w14,w4,w5 + stp w9,w10,[x0,#24] + mov w15,wzr + mov x17,sp + b.ne .L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret + +.globl blst_sha256_emit + +.def blst_sha256_emit; +.type 32; +.endef +.p2align 4 +blst_sha256_emit: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] +#ifndef __AARCH64EB__ + rev x4,x4 + rev x5,x5 + rev x6,x6 + rev x7,x7 +#endif + str w4,[x0,#4] + lsr x4,x4,#32 + str w5,[x0,#12] + lsr x5,x5,#32 + str w6,[x0,#20] + lsr x6,x6,#32 + str w7,[x0,#28] + lsr x7,x7,#32 + str w4,[x0,#0] + str w5,[x0,#8] + str w6,[x0,#16] + str w7,[x0,#24] + ret + + +.globl blst_sha256_bcopy + +.def blst_sha256_bcopy; +.type 32; +.endef +.p2align 4 +blst_sha256_bcopy: +.Loop_bcopy: + ldrb w3,[x1],#1 + sub x2,x2,#1 + strb w3,[x0],#1 + cbnz x2,.Loop_bcopy + ret + + +.globl blst_sha256_hcopy + +.def blst_sha256_hcopy; +.type 32; +.endef +.p2align 4 +blst_sha256_hcopy: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + ret + diff --git a/crypto/blst_src/build/coff/sha256-portable-x86_64.s b/crypto/blst_src/build/coff/sha256-portable-x86_64.s new file mode 100644 index 00000000000..e499d107c70 --- /dev/null +++ b/crypto/blst_src/build/coff/sha256-portable-x86_64.s @@ -0,0 +1,1784 @@ +.text + +.globl blst_sha256_block_data_order +.def blst_sha256_block_data_order; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_block_data_order: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_blst_sha256_block_data_order: + movq %rcx,%rdi + 
movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $64+24,%rsp + + leaq (%rsi,%rdx,4),%rdx + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) +.LSEH_body_blst_sha256_block_data_order: + + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp .Lloop + +.p2align 4 +.Lloop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 0(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 4(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 4(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 8(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 12(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 16(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 20(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl 
%r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 20(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 24(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 28(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + addl %r14d,%eax + movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 32(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 36(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 36(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 40(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl 
%ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 44(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 48(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 52(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 56(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 60(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + jmp .Lrounds_16_xx +.p2align 4 +.Lrounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + + addl 0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 64(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + 
movl 8(%rsp),%r13d + movl 60(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 68(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 72(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 16(%rsp),%r13d + movl 4(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 76(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + + addl 16(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 80(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 24(%rsp),%r13d + movl 12(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl 
%edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + + addl 20(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 84(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 88(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 92(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + movl 36(%rsp),%r13d + movl 24(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 96(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 40(%rsp),%r13d + movl 28(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl 
$10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 100(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 104(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 108(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 112(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d + + addl 
52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 116(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d + + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 120(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d + + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 124(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + leaq 64(%rbp),%rbp + cmpb $0x19,3(%rbp) + jnz .Lrounds_16_xx + + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop + + leaq 64+24+48(%rsp),%r11 + + movq 64+24(%rsp),%r15 + + movq -40(%r11),%r14 + + movq -32(%r11),%r13 + + movq -24(%r11),%r12 + + movq -16(%r11),%rbp + + movq -8(%r11),%rbx + +.LSEH_epilogue_blst_sha256_block_data_order: + mov 8(%r11),%rdi + mov 16(%r11),%rsi + + leaq (%r11),%rsp + .byte 0xf3,0xc3 + +.LSEH_end_blst_sha256_block_data_order: + +.p2align 6 + +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl blst_sha256_emit + +.def blst_sha256_emit; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_emit: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + bswapq %r8 + movq 24(%rdx),%r11 + bswapq %r9 + movl %r8d,4(%rcx) + bswapq %r10 + movl %r9d,12(%rcx) + bswapq %r11 + movl %r10d,20(%rcx) + shrq $32,%r8 + movl %r11d,28(%rcx) + shrq $32,%r9 + movl %r8d,0(%rcx) + shrq $32,%r10 + movl %r9d,8(%rcx) + shrq $32,%r11 + movl %r10d,16(%rcx) + movl %r11d,24(%rcx) + .byte 0xf3,0xc3 + + +.globl blst_sha256_bcopy + +.def blst_sha256_bcopy; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_bcopy: + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rdx,%rcx +.Loop_bcopy: + movzbl (%rdx),%eax + leaq 1(%rdx),%rdx + movb %al,-1(%rcx,%rdx,1) + decq %r8 + jnz .Loop_bcopy + .byte 0xf3,0xc3 + + +.globl blst_sha256_hcopy + +.def blst_sha256_hcopy; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_hcopy: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq %r8,0(%rcx) + movq %r9,8(%rcx) + movq %r10,16(%rcx) + movq %r11,24(%rcx) + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_blst_sha256_block_data_order +.rva .LSEH_body_blst_sha256_block_data_order +.rva .LSEH_info_blst_sha256_block_data_order_prologue + +.rva .LSEH_body_blst_sha256_block_data_order +.rva .LSEH_epilogue_blst_sha256_block_data_order +.rva .LSEH_info_blst_sha256_block_data_order_body + +.rva .LSEH_epilogue_blst_sha256_block_data_order +.rva .LSEH_end_blst_sha256_block_data_order +.rva .LSEH_info_blst_sha256_block_data_order_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_blst_sha256_block_data_order_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_blst_sha256_block_data_order_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x0b,0x00 +.byte 0x00,0xe4,0x0c,0x00 +.byte 0x00,0xd4,0x0d,0x00 +.byte 0x00,0xc4,0x0e,0x00 +.byte 0x00,0x54,0x0f,0x00 +.byte 0x00,0x34,0x10,0x00 +.byte 0x00,0x74,0x12,0x00 +.byte 0x00,0x64,0x13,0x00 +.byte 0x00,0x01,0x11,0x00 +.LSEH_info_blst_sha256_block_data_order_epilogue: +.byte 1,0,5,11 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x03 +.byte 0x00,0x00 + diff --git a/crypto/blst_src/build/coff/sha256-x86_64.s b/crypto/blst_src/build/coff/sha256-x86_64.s new file mode 100644 index 00000000000..ed28b781d4c --- /dev/null +++ b/crypto/blst_src/build/coff/sha256-x86_64.s @@ -0,0 +1,1560 @@ +.text + +.p2align 6 + +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 
0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl blst_sha256_block_data_order_shaext + +.def blst_sha256_block_data_order_shaext; .scl 2; .type 32; .endef +.p2align 6 +blst_sha256_block_data_order_shaext: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_blst_sha256_block_data_order_shaext: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + subq $0x58,%rsp + + movaps %xmm6,-88(%r11) + + movaps %xmm7,-72(%r11) + + movaps %xmm8,-56(%r11) + + movaps %xmm9,-40(%r11) + + movaps %xmm10,-24(%r11) + +.LSEH_body_blst_sha256_block_data_order_shaext: + + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 256-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp .Loop_shaext + +.p2align 4 +.Loop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 16-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 48-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 64-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 80-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 96-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 112-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd 
%xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 144-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 176-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 208-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 224-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 240-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz .Loop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + movaps -88(%r11),%xmm6 + movaps -72(%r11),%xmm7 + movaps -56(%r11),%xmm8 + movaps -40(%r11),%xmm9 + movaps -24(%r11),%xmm10 + movq %r11,%rsp + +.LSEH_epilogue_blst_sha256_block_data_order_shaext: + mov 8(%r11),%rdi + mov 16(%r11),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_blst_sha256_block_data_order_shaext: +.globl blst_sha256_block_data_order + +.def blst_sha256_block_data_order; .scl 2; .type 32; .endef +.p2align 6 +blst_sha256_block_data_order: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_blst_sha256_block_data_order: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $104,%rsp + + leaq (%rsi,%rdx,4),%rdx + movq %rdi,0(%rsp) + + movq %rdx,16(%rsp) + movaps %xmm6,32(%rsp) + + movaps %xmm7,48(%rsp) + + movaps %xmm8,64(%rsp) + + movaps %xmm9,80(%rsp) + + movq %rsp,%rbp + +.LSEH_body_blst_sha256_block_data_order: + + + leaq -64(%rsp),%rsp + movl 0(%rdi),%eax + andq $-64,%rsp + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp .Lloop_ssse3 +.p2align 4 +.Lloop_ssse3: + movdqa K256+256(%rip),%xmm7 + movq %rsi,8(%rbp) + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rsi +.byte 102,15,56,0,207 + movdqa 0(%rsi),%xmm4 + movdqa 16(%rsi),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 32(%rsi),%xmm6 +.byte 102,15,56,0,223 + movdqa 48(%rsi),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + 
paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lssse3_00_47 + +.p2align 4 +.Lssse3_00_47: + subq $-64,%rsi + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd 
$250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 16(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl 
%r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 32(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 48(%rsi),%xmm6 + rorl $6,%r13d + andl 
%edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,67(%rsi) + jne .Lssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl 
%r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl 
%r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 0(%rbp),%rdi + movl %r14d,%eax + movq 8(%rbp),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + leaq 64(%rsi),%rsi + cmpq 16(%rbp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_ssse3 + + xorps %xmm0,%xmm0 + leaq 104+48(%rbp),%r11 + + movaps %xmm0,0(%rsp) + movaps %xmm0,16(%rsp) + movaps %xmm0,32(%rsp) + movaps %xmm0,48(%rsp) + movaps 32(%rbp),%xmm6 + movaps 48(%rbp),%xmm7 + movaps 64(%rbp),%xmm8 + movaps 80(%rbp),%xmm9 + movq 104(%rbp),%r15 + + movq -40(%r11),%r14 + + movq -32(%r11),%r13 + + movq -24(%r11),%r12 + + movq -16(%r11),%rbx + + movq -8(%r11),%rbp + +.LSEH_epilogue_blst_sha256_block_data_order: + mov 8(%r11),%rdi + mov 16(%r11),%rsi + + leaq (%r11),%rsp + .byte 0xf3,0xc3 + +.LSEH_end_blst_sha256_block_data_order: +.globl blst_sha256_emit + +.def blst_sha256_emit; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_emit: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + bswapq %r8 + movq 24(%rdx),%r11 + bswapq %r9 + movl %r8d,4(%rcx) + bswapq %r10 + movl %r9d,12(%rcx) + bswapq %r11 + movl %r10d,20(%rcx) + shrq $32,%r8 + movl %r11d,28(%rcx) + shrq $32,%r9 + movl %r8d,0(%rcx) + shrq $32,%r10 + movl %r9d,8(%rcx) + shrq $32,%r11 + movl %r10d,16(%rcx) + movl %r11d,24(%rcx) + .byte 0xf3,0xc3 + + +.globl blst_sha256_bcopy + +.def blst_sha256_bcopy; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_bcopy: + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rdx,%rcx +.Loop_bcopy: + movzbl (%rdx),%eax + leaq 1(%rdx),%rdx + movb %al,-1(%rcx,%rdx,1) + decq %r8 + jnz .Loop_bcopy + .byte 0xf3,0xc3 + + +.globl blst_sha256_hcopy + +.def blst_sha256_hcopy; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_hcopy: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq %r8,0(%rcx) + movq %r9,8(%rcx) + movq %r10,16(%rcx) + movq %r11,24(%rcx) + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_blst_sha256_block_data_order_shaext +.rva .LSEH_body_blst_sha256_block_data_order_shaext +.rva .LSEH_info_blst_sha256_block_data_order_shaext_prologue + +.rva .LSEH_body_blst_sha256_block_data_order_shaext +.rva .LSEH_epilogue_blst_sha256_block_data_order_shaext +.rva .LSEH_info_blst_sha256_block_data_order_shaext_body + +.rva .LSEH_epilogue_blst_sha256_block_data_order_shaext +.rva .LSEH_end_blst_sha256_block_data_order_shaext +.rva .LSEH_info_blst_sha256_block_data_order_shaext_epilogue + +.rva .LSEH_begin_blst_sha256_block_data_order +.rva .LSEH_body_blst_sha256_block_data_order +.rva .LSEH_info_blst_sha256_block_data_order_prologue + +.rva .LSEH_body_blst_sha256_block_data_order +.rva .LSEH_epilogue_blst_sha256_block_data_order +.rva 
.LSEH_info_blst_sha256_block_data_order_body + +.rva .LSEH_epilogue_blst_sha256_block_data_order +.rva .LSEH_end_blst_sha256_block_data_order +.rva .LSEH_info_blst_sha256_block_data_order_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_blst_sha256_block_data_order_shaext_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_blst_sha256_block_data_order_shaext_body: +.byte 1,0,15,0 +.byte 0x00,0x68,0x00,0x00 +.byte 0x00,0x78,0x01,0x00 +.byte 0x00,0x88,0x02,0x00 +.byte 0x00,0x98,0x03,0x00 +.byte 0x00,0xa8,0x04,0x00 +.byte 0x00,0x74,0x0c,0x00 +.byte 0x00,0x64,0x0d,0x00 +.byte 0x00,0xa2 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.LSEH_info_blst_sha256_block_data_order_shaext_epilogue: +.byte 1,0,5,11 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x03 +.byte 0x00,0x00 + +.LSEH_info_blst_sha256_block_data_order_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_blst_sha256_block_data_order_body: +.byte 1,0,26,5 +.byte 0x00,0x68,0x02,0x00 +.byte 0x00,0x78,0x03,0x00 +.byte 0x00,0x88,0x04,0x00 +.byte 0x00,0x98,0x05,0x00 +.byte 0x00,0xf4,0x0d,0x00 +.byte 0x00,0xe4,0x0e,0x00 +.byte 0x00,0xd4,0x0f,0x00 +.byte 0x00,0xc4,0x10,0x00 +.byte 0x00,0x34,0x11,0x00 +.byte 0x00,0x74,0x14,0x00 +.byte 0x00,0x64,0x15,0x00 +.byte 0x00,0x03 +.byte 0x00,0x01,0x12,0x00 +.byte 0x00,0x50 +.LSEH_info_blst_sha256_block_data_order_epilogue: +.byte 1,0,5,11 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x03 +.byte 0x00,0x00 + diff --git a/crypto/blst_src/build/elf/add_mod_256-armv8.S b/crypto/blst_src/build/elf/add_mod_256-armv8.S new file mode 100644 index 00000000000..57476aaa1da --- /dev/null +++ b/crypto/blst_src/build/elf/add_mod_256-armv8.S @@ -0,0 +1,379 @@ +.text + +.globl add_mod_256 +.hidden add_mod_256 +.type add_mod_256,%function +.align 5 +add_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + adds x8,x8,x12 + ldp x14,x15,[x2,#16] + adcs x9,x9,x13 + ldp x4,x5,[x3] + adcs x10,x10,x14 + ldp x6,x7,[x3,#16] + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret +.size add_mod_256,.-add_mod_256 + +.globl mul_by_3_mod_256 +.hidden mul_by_3_mod_256 +.type mul_by_3_mod_256,%function +.align 5 +mul_by_3_mod_256: + ldp x12,x13,[x1] + ldp x14,x15,[x1,#16] + + adds x8,x12,x12 + ldp x4,x5,[x2] + adcs x9,x13,x13 + ldp x6,x7,[x2,#16] + adcs x10,x14,x14 + adcs x11,x15,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + adds x8,x8,x12 + adcs x9,x9,x13 + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret +.size mul_by_3_mod_256,.-mul_by_3_mod_256 + +.globl lshift_mod_256 +.hidden lshift_mod_256 +.type lshift_mod_256,%function +.align 5 +lshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +.Loop_lshift_mod_256: + adds x8,x8,x8 + sub x2,x2,#1 + adcs x9,x9,x9 + adcs x10,x10,x10 + adcs x11,x11,x11 + adc x3,xzr,xzr + + subs x12,x8,x4 + sbcs x13,x9,x5 + sbcs x14,x10,x6 + sbcs 
x15,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x12,lo + csel x9,x9,x13,lo + csel x10,x10,x14,lo + csel x11,x11,x15,lo + + cbnz x2,.Loop_lshift_mod_256 + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret +.size lshift_mod_256,.-lshift_mod_256 + +.globl rshift_mod_256 +.hidden rshift_mod_256 +.type rshift_mod_256,%function +.align 5 +rshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +.Loop_rshift: + adds x12,x8,x4 + sub x2,x2,#1 + adcs x13,x9,x5 + adcs x14,x10,x6 + adcs x15,x11,x7 + adc x3,xzr,xzr + tst x8,#1 + + csel x12,x12,x8,ne + csel x13,x13,x9,ne + csel x14,x14,x10,ne + csel x15,x15,x11,ne + csel x3,x3,xzr,ne + + extr x8,x13,x12,#1 + extr x9,x14,x13,#1 + extr x10,x15,x14,#1 + extr x11,x3,x15,#1 + + cbnz x2,.Loop_rshift + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret +.size rshift_mod_256,.-rshift_mod_256 + +.globl cneg_mod_256 +.hidden cneg_mod_256 +.type cneg_mod_256,%function +.align 5 +cneg_mod_256: + ldp x8,x9,[x1] + ldp x4,x5,[x3] + + ldp x10,x11,[x1,#16] + subs x12,x4,x8 + ldp x6,x7,[x3,#16] + orr x4,x8,x9 + sbcs x13,x5,x9 + orr x5,x10,x11 + sbcs x14,x6,x10 + orr x3,x4,x5 + sbc x15,x7,x11 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x8,x8,x12,eq + csel x9,x9,x13,eq + csel x10,x10,x14,eq + stp x8,x9,[x0] + csel x11,x11,x15,eq + stp x10,x11,[x0,#16] + + ret +.size cneg_mod_256,.-cneg_mod_256 + +.globl sub_mod_256 +.hidden sub_mod_256 +.type sub_mod_256,%function +.align 5 +sub_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + subs x8,x8,x12 + ldp x14,x15,[x2,#16] + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + stp x8,x9,[x0] + adc x11,x11,x7 + stp x10,x11,[x0,#16] + + ret +.size sub_mod_256,.-sub_mod_256 + +.globl check_mod_256 +.hidden check_mod_256 +.type check_mod_256,%function +.align 5 +check_mod_256: + ldp x8,x9,[x0] + ldp x10,x11,[x0,#16] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + subs xzr,x8,x4 + sbcs xzr,x9,x5 + orr x8,x8,x9 + sbcs xzr,x10,x6 + orr x8,x8,x10 + sbcs xzr,x11,x7 + orr x8,x8,x11 + sbc x1,xzr,xzr + + cmp x8,#0 + mov x0,#1 + csel x0,x0,xzr,ne + and x0,x0,x1 + + ret +.size check_mod_256,.-check_mod_256 + +.globl add_n_check_mod_256 +.hidden add_n_check_mod_256 +.type add_n_check_mod_256,%function +.align 5 +add_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + adds x8,x8,x12 + ldp x4,x5,[x3] + adcs x9,x9,x13 + ldp x6,x7,[x3,#16] + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret +.size add_n_check_mod_256,.-add_n_check_mod_256 + +.globl sub_n_check_mod_256 +.hidden sub_n_check_mod_256 +.type sub_n_check_mod_256,%function +.align 5 +sub_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp 
x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + subs x8,x8,x12 + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + adc x11,x11,x7 + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret +.size sub_n_check_mod_256,.-sub_n_check_mod_256 diff --git a/crypto/blst_src/build/elf/add_mod_256-x86_64.s b/crypto/blst_src/build/elf/add_mod_256-x86_64.s new file mode 100644 index 00000000000..2f41781959c --- /dev/null +++ b/crypto/blst_src/build/elf/add_mod_256-x86_64.s @@ -0,0 +1,572 @@ +.text + +.globl add_mod_256 +.hidden add_mod_256 +.type add_mod_256,@function +.align 32 +add_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loaded_a_add_mod_256: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_256,.-add_mod_256 + + +.globl mul_by_3_mod_256 +.hidden mul_by_3_mod_256 +.type mul_by_3_mod_256,@function +.align 32 +mul_by_3_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq %rdx,%rcx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rsi,%rdx + movq 24(%rsi),%r11 + + call __lshift_mod_256 + movq 0(%rsp),%r12 +.cfi_restore %r12 + jmp .Loaded_a_add_mod_256 + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_3_mod_256,.-mul_by_3_mod_256 + +.type __lshift_mod_256,@function +.align 32 +__lshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + movq %r8,%rax + adcq %r10,%r10 + movq %r9,%rsi + adcq %r11,%r11 + sbbq %r12,%r12 + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + cmovcq %rbx,%r10 + cmovcq %rbp,%r11 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __lshift_mod_256,.-__lshift_mod_256 + + +.globl lshift_mod_256 +.hidden lshift_mod_256 +.type lshift_mod_256,@function +.align 32 +lshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 
+ pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loop_lshift_mod_256: + call __lshift_mod_256 + decl %edx + jnz .Loop_lshift_mod_256 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size lshift_mod_256,.-lshift_mod_256 + + +.globl rshift_mod_256 +.hidden rshift_mod_256 +.type rshift_mod_256,@function +.align 32 +rshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rbp + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loop_rshift_mod_256: + movq %rbp,%r8 + andq $1,%rbp + movq 0(%rcx),%rax + negq %rbp + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + + andq %rbp,%rax + andq %rbp,%rsi + andq %rbp,%rbx + andq 24(%rcx),%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + adcq %rbx,%r10 + adcq %rbp,%r11 + sbbq %rax,%rax + + shrq $1,%r8 + movq %r9,%rbp + shrq $1,%r9 + movq %r10,%rbx + shrq $1,%r10 + movq %r11,%rsi + shrq $1,%r11 + + shlq $63,%rbp + shlq $63,%rbx + orq %r8,%rbp + shlq $63,%rsi + orq %rbx,%r9 + shlq $63,%rax + orq %rsi,%r10 + orq %rax,%r11 + + decl %edx + jnz .Loop_rshift_mod_256 + + movq %rbp,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size rshift_mod_256,.-rshift_mod_256 + + +.globl cneg_mod_256 +.hidden cneg_mod_256 +.type cneg_mod_256,@function +.align 32 +cneg_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq 0(%rsi),%r12 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %r12,%r8 + movq 24(%rsi),%r11 + orq %r9,%r12 + orq %r10,%r12 + orq %r11,%r12 + movq $-1,%rbp + + movq 0(%rcx),%rax + cmovnzq %rbp,%r12 + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + andq %r12,%rax + movq 24(%rcx),%rbp + andq %r12,%rsi + andq %r12,%rbx + andq %r12,%rbp + + subq %r8,%rax + sbbq %r9,%rsi + sbbq %r10,%rbx + sbbq %r11,%rbp + + orq %rdx,%rdx + + cmovzq %r8,%rax + cmovzq %r9,%rsi + movq %rax,0(%rdi) + cmovzq %r10,%rbx + movq %rsi,8(%rdi) + cmovzq %r11,%rbp + movq %rbx,16(%rdi) + movq %rbp,24(%rdi) + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size cneg_mod_256,.-cneg_mod_256 + + +.globl sub_mod_256 +.hidden sub_mod_256 +.type sub_mod_256,@function +.align 32 +sub_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq 
%rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_256,.-sub_mod_256 + + +.globl check_mod_256 +.hidden check_mod_256 +.type check_mod_256,@function +.align 32 +check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + movq 0(%rdi),%rax + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + + movq %rax,%r8 + orq %r9,%rax + orq %r10,%rax + orq %r11,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq %rsi,%rsi + + movq $1,%rdx + cmpq $0,%rax + cmovneq %rdx,%rax + andq %rsi,%rax + + .byte 0xf3,0xc3 +.cfi_endproc +.size check_mod_256,.-check_mod_256 + + +.globl add_n_check_mod_256 +.hidden add_n_check_mod_256 +.type add_n_check_mod_256,@function +.align 32 +add_n_check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_n_check_mod_256,.-add_n_check_mod_256 + + +.globl sub_n_check_mod_256 +.hidden sub_n_check_mod_256 +.type sub_n_check_mod_256,@function +.align 32 +sub_n_check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_n_check_mod_256,.-sub_n_check_mod_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/add_mod_384-armv8.S b/crypto/blst_src/build/elf/add_mod_384-armv8.S new file mode 100644 index 00000000000..5c18d7fe892 --- /dev/null +++ 
b/crypto/blst_src/build/elf/add_mod_384-armv8.S @@ -0,0 +1,1000 @@ +.text + +.globl add_mod_384 +.hidden add_mod_384 +.type add_mod_384,%function +.align 5 +add_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size add_mod_384,.-add_mod_384 + +.type __add_mod_384,%function +.align 5 +__add_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + +__add_mod_384_ab_are_loaded: + adds x10,x10,x16 + adcs x11,x11,x17 + adcs x12,x12,x19 + adcs x13,x13,x20 + adcs x14,x14,x21 + adcs x15,x15,x22 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret +.size __add_mod_384,.-__add_mod_384 + +.globl add_mod_384x +.hidden add_mod_384x +.type add_mod_384x,%function +.align 5 +add_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size add_mod_384x,.-add_mod_384x + +.globl rshift_mod_384 +.hidden rshift_mod_384 +.type rshift_mod_384,%function +.align 5 +rshift_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +.Loop_rshift_mod_384: + sub x2,x2,#1 + bl __rshift_mod_384 + cbnz x2,.Loop_rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size rshift_mod_384,.-rshift_mod_384 + +.type __rshift_mod_384,%function +.align 5 +__rshift_mod_384: + sbfx x22,x10,#0,#1 + and x16,x22,x4 + and x17,x22,x5 + adds x10,x10,x16 + and x19,x22,x6 + adcs x11,x11,x17 + and x20,x22,x7 + adcs x12,x12,x19 + and x21,x22,x8 + adcs x13,x13,x20 + and x22,x22,x9 + adcs x14,x14,x21 + extr x10,x11,x10,#1 // a[0:5] >>= 1 + adcs x15,x15,x22 + extr x11,x12,x11,#1 + adc x22,xzr,xzr + extr x12,x13,x12,#1 + extr x13,x14,x13,#1 + extr x14,x15,x14,#1 + extr x15,x22,x15,#1 + ret +.size __rshift_mod_384,.-__rshift_mod_384 + +.globl div_by_2_mod_384 +.hidden div_by_2_mod_384 +.type div_by_2_mod_384,%function +.align 5 +div_by_2_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size div_by_2_mod_384,.-div_by_2_mod_384 + +.globl lshift_mod_384 +.hidden lshift_mod_384 +.type lshift_mod_384,%function +.align 5 +lshift_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +.Loop_lshift_mod_384: + sub x2,x2,#1 + bl __lshift_mod_384 + cbnz x2,.Loop_lshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size lshift_mod_384,.-lshift_mod_384 + +.type __lshift_mod_384,%function +.align 5 +__lshift_mod_384: + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret +.size __lshift_mod_384,.-__lshift_mod_384 + +.globl mul_by_3_mod_384 +.hidden mul_by_3_mod_384 +.type mul_by_3_mod_384,%function +.align 5 +mul_by_3_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_3_mod_384,.-mul_by_3_mod_384 + +.globl mul_by_8_mod_384 +.hidden mul_by_8_mod_384 +.type mul_by_8_mod_384,%function +.align 5 +mul_by_8_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_8_mod_384,.-mul_by_8_mod_384 + +.globl mul_by_3_mod_384x +.hidden mul_by_3_mod_384x +.type mul_by_3_mod_384x,%function +.align 5 +mul_by_3_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + + ldp x16,x17,[x1,#48] + ldp x19,x20,[x1,#64] + ldp x21,x22,[x1,#80] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_3_mod_384x,.-mul_by_3_mod_384x + +.globl mul_by_8_mod_384x +.hidden mul_by_8_mod_384x +.type mul_by_8_mod_384x,%function +.align 5 +mul_by_8_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_8_mod_384x,.-mul_by_8_mod_384x + +.globl cneg_mod_384 +.hidden cneg_mod_384 +.type cneg_mod_384,%function +.align 5 +cneg_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x4,x5,[x3] + ldp x12,x13,[x1,#16] + ldp x6,x7,[x3,#16] + + subs x16,x4,x10 + ldp x14,x15,[x1,#32] + ldp x8,x9,[x3,#32] + orr x3,x10,x11 + sbcs x17,x5,x11 + orr x3,x3,x12 + sbcs x19,x6,x12 + orr x3,x3,x13 + sbcs x20,x7,x13 + orr x3,x3,x14 + sbcs x21,x8,x14 + orr x3,x3,x15 + sbc x22,x9,x15 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x10,x10,x16,eq + csel x11,x11,x17,eq + csel x12,x12,x19,eq + csel x13,x13,x20,eq + stp x10,x11,[x0] + csel x14,x14,x21,eq + stp x12,x13,[x0,#16] + csel x15,x15,x22,eq + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size cneg_mod_384,.-cneg_mod_384 + +.globl sub_mod_384 +.hidden sub_mod_384 +.type sub_mod_384,%function +.align 5 +sub_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size sub_mod_384,.-sub_mod_384 + +.type __sub_mod_384,%function +.align 5 +__sub_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + + subs x10,x10,x16 + sbcs x11,x11,x17 + sbcs x12,x12,x19 + sbcs x13,x13,x20 + sbcs x14,x14,x21 + sbcs x15,x15,x22 + sbc x3,xzr,xzr + + and x16,x4,x3 + and x17,x5,x3 + adds x10,x10,x16 + and x19,x6,x3 + adcs x11,x11,x17 + and x20,x7,x3 + adcs x12,x12,x19 + and x21,x8,x3 + adcs x13,x13,x20 + and x22,x9,x3 + adcs x14,x14,x21 + adc x15,x15,x22 + + ret +.size __sub_mod_384,.-__sub_mod_384 + +.globl sub_mod_384x +.hidden sub_mod_384x +.type sub_mod_384x,%function +.align 5 +sub_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size sub_mod_384x,.-sub_mod_384x + +.globl mul_by_1_plus_i_mod_384x +.hidden mul_by_1_plus_i_mod_384x +.type mul_by_1_plus_i_mod_384x,%function +.align 5 +mul_by_1_plus_i_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + add x2,x1,#48 + + bl __sub_mod_384 // a->re - a->im + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __add_mod_384_ab_are_loaded // a->re + a->im + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x + +.globl sgn0_pty_mod_384 +.hidden sgn0_pty_mod_384 +.type sgn0_pty_mod_384,%function +.align 5 +sgn0_pty_mod_384: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x0,x10,#1 + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x3,x3,xzr + + mvn x3,x3 + and x3,x3,#2 + orr x0,x0,x3 + + ret +.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 + +.globl sgn0_pty_mod_384x +.hidden sgn0_pty_mod_384x +.type sgn0_pty_mod_384x,%function +.align 5 +sgn0_pty_mod_384x: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x2,x10,#1 + orr x3,x10,x11 + adds x10,x10,x10 + orr x3,x3,x12 + adcs x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + 
sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + ldp x10,x11,[x0,#48] + ldp x12,x13,[x0,#64] + ldp x14,x15,[x0,#80] + + mvn x16,x16 + and x16,x16,#2 + orr x2,x2,x16 + + and x0,x10,#1 + orr x1,x10,x11 + adds x10,x10,x10 + orr x1,x1,x12 + adcs x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + mvn x16,x16 + and x16,x16,#2 + orr x0,x0,x16 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ret +.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x +.globl vec_select_32 +.hidden vec_select_32 +.type vec_select_32,%function +.align 5 +vec_select_32: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret +.size vec_select_32,.-vec_select_32 +.globl vec_select_48 +.hidden vec_select_48 +.type vec_select_48,%function +.align 5 +vec_select_48: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret +.size vec_select_48,.-vec_select_48 +.globl vec_select_96 +.hidden vec_select_96 +.type vec_select_96,%function +.align 5 +vec_select_96: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret +.size vec_select_96,.-vec_select_96 +.globl vec_select_192 +.hidden vec_select_192 +.type vec_select_192,%function +.align 5 +vec_select_192: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret +.size vec_select_192,.-vec_select_192 +.globl vec_select_144 +.hidden vec_select_144 +.type vec_select_144,%function +.align 5 +vec_select_144: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, 
[x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret +.size vec_select_144,.-vec_select_144 +.globl vec_select_288 +.hidden vec_select_288 +.type vec_select_288,%function +.align 5 +vec_select_288: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret +.size vec_select_288,.-vec_select_288 +.globl vec_prefetch +.hidden vec_prefetch +.type vec_prefetch,%function +.align 5 +vec_prefetch: + add x1, x1, x0 + sub x1, x1, #1 + mov x2, #64 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + prfm pldl1keep, [x0] + ret +.size vec_prefetch,.-vec_prefetch +.globl vec_is_zero_16x +.hidden vec_is_zero_16x +.type vec_is_zero_16x,%function +.align 5 +vec_is_zero_16x: + ld1 {v0.2d}, [x0], #16 + lsr x1, x1, #4 + sub x1, x1, #1 + cbz x1, .Loop_is_zero_done + +.Loop_is_zero: + ld1 {v1.2d}, [x0], #16 + orr v0.16b, v0.16b, v1.16b + sub x1, x1, #1 + cbnz x1, .Loop_is_zero + +.Loop_is_zero_done: + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret +.size vec_is_zero_16x,.-vec_is_zero_16x +.globl vec_is_equal_16x +.hidden vec_is_equal_16x +.type vec_is_equal_16x,%function +.align 5 +vec_is_equal_16x: + ld1 {v0.2d}, [x0], #16 + ld1 {v1.2d}, [x1], #16 + lsr x2, x2, #4 + eor v0.16b, v0.16b, v1.16b + +.Loop_is_equal: + sub x2, x2, #1 + cbz x2, .Loop_is_equal_done + ld1 {v1.2d}, [x0], #16 + ld1 {v2.2d}, 
[x1], #16 + eor v1.16b, v1.16b, v2.16b + orr v0.16b, v0.16b, v1.16b + b .Loop_is_equal + nop + +.Loop_is_equal_done: + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret +.size vec_is_equal_16x,.-vec_is_equal_16x diff --git a/crypto/blst_src/build/elf/add_mod_384-x86_64.s b/crypto/blst_src/build/elf/add_mod_384-x86_64.s new file mode 100644 index 00000000000..39eee6d1752 --- /dev/null +++ b/crypto/blst_src/build/elf/add_mod_384-x86_64.s @@ -0,0 +1,1907 @@ +.text + +.globl add_mod_384 +.hidden add_mod_384 +.type add_mod_384,@function +.align 32 +add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __add_mod_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_384,.-add_mod_384 + +.type __add_mod_384,@function +.align 32 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__add_mod_384_a_is_loaded: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384,.-__add_mod_384 + +.globl add_mod_384x +.hidden add_mod_384x +.type add_mod_384x,@function +.align 32 +add_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 24 + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __add_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + movq 24+0(%rsp),%r15 +.cfi_restore %r15 + movq 24+8(%rsp),%r14 +.cfi_restore %r14 + movq 24+16(%rsp),%r13 +.cfi_restore %r13 + movq 24+24(%rsp),%r12 +.cfi_restore %r12 + movq 24+32(%rsp),%rbx +.cfi_restore %rbx + movq 24+40(%rsp),%rbp +.cfi_restore %rbp + leaq 24+48(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size 
add_mod_384x,.-add_mod_384x + + +.globl rshift_mod_384 +.hidden rshift_mod_384 +.type rshift_mod_384,@function +.align 32 +rshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +.Loop_rshift_mod_384: + call __rshift_mod_384 + decl %edx + jnz .Loop_rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size rshift_mod_384,.-rshift_mod_384 + +.type __rshift_mod_384,@function +.align 32 +__rshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rsi + movq 0(%rcx),%r14 + andq %r8,%rsi + movq 8(%rcx),%r15 + negq %rsi + movq 16(%rcx),%rax + andq %rsi,%r14 + movq 24(%rcx),%rbx + andq %rsi,%r15 + movq 32(%rcx),%rbp + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%rbx + adcq %r12,%rbp + adcq %r13,%rsi + sbbq %r13,%r13 + + shrq $1,%r14 + movq %r15,%r8 + shrq $1,%r15 + movq %rax,%r9 + shrq $1,%rax + movq %rbx,%r10 + shrq $1,%rbx + movq %rbp,%r11 + shrq $1,%rbp + movq %rsi,%r12 + shrq $1,%rsi + shlq $63,%r8 + shlq $63,%r9 + orq %r14,%r8 + shlq $63,%r10 + orq %r15,%r9 + shlq $63,%r11 + orq %rax,%r10 + shlq $63,%r12 + orq %rbx,%r11 + shlq $63,%r13 + orq %rbp,%r12 + orq %rsi,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __rshift_mod_384,.-__rshift_mod_384 + +.globl div_by_2_mod_384 +.hidden div_by_2_mod_384 +.type div_by_2_mod_384,@function +.align 32 +div_by_2_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq %rdx,%rcx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + call __rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size div_by_2_mod_384,.-div_by_2_mod_384 + + +.globl lshift_mod_384 +.hidden lshift_mod_384 +.type lshift_mod_384,@function +.align 32 
+lshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +.Loop_lshift_mod_384: + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdi,%rdi + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdi + + movq (%rsp),%rdi + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + decl %edx + jnz .Loop_lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size lshift_mod_384,.-lshift_mod_384 + +.type __lshift_mod_384,@function +.align 32 +__lshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __lshift_mod_384,.-__lshift_mod_384 + + +.globl mul_by_3_mod_384 +.hidden mul_by_3_mod_384 +.type mul_by_3_mod_384,@function +.align 32 +mul_by_3_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_3_mod_384,.-mul_by_3_mod_384 + +.globl mul_by_8_mod_384 +.hidden mul_by_8_mod_384 +.type mul_by_8_mod_384,@function +.align 32 
+mul_by_8_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_8_mod_384,.-mul_by_8_mod_384 + + +.globl mul_by_3_mod_384x +.hidden mul_by_3_mod_384x +.type mul_by_3_mod_384x,@function +.align 32 +mul_by_3_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq (%rsp),%rsi + leaq 48(%rdi),%rdi + + movq 48(%rsi),%r8 + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + movq 72(%rsi),%r11 + movq 80(%rsi),%r12 + movq 88(%rsi),%r13 + + call __lshift_mod_384 + + movq $48,%rdx + addq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_3_mod_384x,.-mul_by_3_mod_384x + +.globl mul_by_8_mod_384x +.hidden mul_by_8_mod_384x +.type mul_by_8_mod_384x,@function +.align 32 +mul_by_8_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq (%rsp),%rsi + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + 
movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,48+0(%rdi) + movq %r9,48+8(%rdi) + movq %r10,48+16(%rdi) + movq %r11,48+24(%rdi) + movq %r12,48+32(%rdi) + movq %r13,48+40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_8_mod_384x,.-mul_by_8_mod_384x + + +.globl cneg_mod_384 +.hidden cneg_mod_384 +.type cneg_mod_384,@function +.align 32 +cneg_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdx +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rdx,%r8 + movq 24(%rsi),%r11 + orq %r9,%rdx + movq 32(%rsi),%r12 + orq %r10,%rdx + movq 40(%rsi),%r13 + orq %r11,%rdx + movq $-1,%rsi + orq %r12,%rdx + orq %r13,%rdx + + movq 0(%rcx),%r14 + cmovnzq %rsi,%rdx + movq 8(%rcx),%r15 + movq 16(%rcx),%rax + andq %rdx,%r14 + movq 24(%rcx),%rbx + andq %rdx,%r15 + movq 32(%rcx),%rbp + andq %rdx,%rax + movq 40(%rcx),%rsi + andq %rdx,%rbx + movq 0(%rsp),%rcx + andq %rdx,%rbp + andq %rdx,%rsi + + subq %r8,%r14 + sbbq %r9,%r15 + sbbq %r10,%rax + sbbq %r11,%rbx + sbbq %r12,%rbp + sbbq %r13,%rsi + + orq %rcx,%rcx + + cmovzq %r8,%r14 + cmovzq %r9,%r15 + cmovzq %r10,%rax + movq %r14,0(%rdi) + cmovzq %r11,%rbx + movq %r15,8(%rdi) + cmovzq %r12,%rbp + movq %rax,16(%rdi) + cmovzq %r13,%rsi + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rsi,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size cneg_mod_384,.-cneg_mod_384 + + +.globl sub_mod_384 +.hidden sub_mod_384 +.type sub_mod_384,@function +.align 32 +sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sub_mod_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_384,.-sub_mod_384 + +.type __sub_mod_384,@function +.align 32 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 
0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384,.-__sub_mod_384 + +.globl sub_mod_384x +.hidden sub_mod_384x +.type sub_mod_384x,@function +.align 32 +sub_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 24 + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __sub_mod_384 + + movq 24+0(%rsp),%r15 +.cfi_restore %r15 + movq 24+8(%rsp),%r14 +.cfi_restore %r14 + movq 24+16(%rsp),%r13 +.cfi_restore %r13 + movq 24+24(%rsp),%r12 +.cfi_restore %r12 + movq 24+32(%rsp),%rbx +.cfi_restore %rbx + movq 24+40(%rsp),%rbp +.cfi_restore %rbp + leaq 24+48(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_384x,.-sub_mod_384x +.globl mul_by_1_plus_i_mod_384x +.hidden mul_by_1_plus_i_mod_384x +.type mul_by_1_plus_i_mod_384x,@function +.align 32 +mul_by_1_plus_i_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $56,%rsp +.cfi_adjust_cfa_offset 56 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rbx + adcq 72(%rsi),%r11 + movq %r12,%rcx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + movq %rdi,48(%rsp) + sbbq %rdi,%rdi + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rbx + sbbq 80(%rsi),%rcx + sbbq 88(%rsi),%rbp + sbbq %rsi,%rsi + + movq %r8,0(%rsp) + movq 0(%rdx),%r8 + movq %r9,8(%rsp) + movq 8(%rdx),%r9 + movq %r10,16(%rsp) + movq 16(%rdx),%r10 + movq %r11,24(%rsp) + movq 24(%rdx),%r11 + movq %r12,32(%rsp) + andq %rsi,%r8 + movq 32(%rdx),%r12 + movq %r13,40(%rsp) + andq %rsi,%r9 + movq 40(%rdx),%r13 + andq %rsi,%r10 + andq %rsi,%r11 + andq %rsi,%r12 + andq %rsi,%r13 + movq 48(%rsp),%rsi + + addq %r8,%r14 + movq 0(%rsp),%r8 + adcq %r9,%r15 + movq 8(%rsp),%r9 + adcq %r10,%rax + movq 16(%rsp),%r10 + 
adcq %r11,%rbx + movq 24(%rsp),%r11 + adcq %r12,%rcx + movq 32(%rsp),%r12 + adcq %r13,%rbp + movq 40(%rsp),%r13 + + movq %r14,0(%rsi) + movq %r8,%r14 + movq %r15,8(%rsi) + movq %rax,16(%rsi) + movq %r9,%r15 + movq %rbx,24(%rsi) + movq %rcx,32(%rsi) + movq %r10,%rax + movq %rbp,40(%rsi) + + subq 0(%rdx),%r8 + movq %r11,%rbx + sbbq 8(%rdx),%r9 + sbbq 16(%rdx),%r10 + movq %r12,%rcx + sbbq 24(%rdx),%r11 + sbbq 32(%rdx),%r12 + movq %r13,%rbp + sbbq 40(%rdx),%r13 + sbbq $0,%rdi + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,48(%rsi) + cmovcq %rbx,%r11 + movq %r9,56(%rsi) + cmovcq %rcx,%r12 + movq %r10,64(%rsi) + cmovcq %rbp,%r13 + movq %r11,72(%rsi) + movq %r12,80(%rsi) + movq %r13,88(%rsi) + + movq 56+0(%rsp),%r15 +.cfi_restore %r15 + movq 56+8(%rsp),%r14 +.cfi_restore %r14 + movq 56+16(%rsp),%r13 +.cfi_restore %r13 + movq 56+24(%rsp),%r12 +.cfi_restore %r12 + movq 56+32(%rsp),%rbx +.cfi_restore %rbx + movq 56+40(%rsp),%rbp +.cfi_restore %rbp + leaq 56+48(%rsp),%rsp +.cfi_adjust_cfa_offset -56-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x +.globl sgn0_pty_mod_384 +.hidden sgn0_pty_mod_384 +.type sgn0_pty_mod_384,@function +.align 32 +sgn0_pty_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + movq 40(%rdi),%rdx + + xorq %rax,%rax + movq %r8,%rdi + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + notq %rax + andq $1,%rdi + andq $2,%rax + orq %rdi,%rax + + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 + +.globl sgn0_pty_mod_384x +.hidden sgn0_pty_mod_384x +.type sgn0_pty_mod_384x,@function +.align 32 +sgn0_pty_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 48(%rdi),%r8 + movq 56(%rdi),%r9 + movq 64(%rdi),%r10 + movq 72(%rdi),%r11 + movq 80(%rdi),%rcx + movq 88(%rdi),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + leaq 0(%rdi),%rax + xorq %rdi,%rdi + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rdi + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rdi + + movq %r8,0(%rsp) + notq %rdi + andq $1,%rbp + andq $2,%rdi + orq %rbp,%rdi + + movq 0(%rax),%r8 + movq 8(%rax),%r9 + movq 16(%rax),%r10 + movq 24(%rax),%r11 + movq 32(%rax),%rcx + movq 40(%rax),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + xorq %rax,%rax + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + movq 0(%rsp),%rbx + + notq %rax + + testq %r8,%r8 + cmovzq %rdi,%rbp + + testq %rbx,%rbx + cmovnzq %rdi,%rax + + andq $1,%rbp + andq $2,%rax + orq %rbp,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp 
+.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x +.globl vec_select_32 +.hidden vec_select_32 +.type vec_select_32,@function +.align 32 +vec_select_32: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 16(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 16(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 16(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-16(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-16(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-16(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,16-16(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_32,.-vec_select_32 +.globl vec_select_48 +.hidden vec_select_48 +.type vec_select_48,@function +.align 32 +vec_select_48: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 24(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 24(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 24(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-24(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-24(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-24(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-24(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-24(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-24(%rdi) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,32-24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_48,.-vec_select_48 +.globl vec_select_96 +.hidden vec_select_96 +.type vec_select_96,@function +.align 32 +vec_select_96: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 48(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 48(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 48(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-48(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-48(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-48(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-48(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-48(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-48(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-48(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-48(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-48(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,80-48(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_96,.-vec_select_96 +.globl vec_select_192 +.hidden vec_select_192 +.type vec_select_192,@function +.align 32 +vec_select_192: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 96(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 96(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 96(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-96(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-96(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu 
%xmm0,32-96(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-96(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-96(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-96(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-96(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-96(%rdi) + pand %xmm4,%xmm0 + movdqu 128+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-96(%rdi) + pand %xmm4,%xmm2 + movdqu 144+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-96(%rdi) + pand %xmm4,%xmm0 + movdqu 160+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-96(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,176-96(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_192,.-vec_select_192 +.globl vec_select_144 +.hidden vec_select_144 +.type vec_select_144,@function +.align 32 +vec_select_144: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 72(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 72(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 72(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-72(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-72(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-72(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-72(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-72(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-72(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-72(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-72(%rdi) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,128-72(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_144,.-vec_select_144 +.globl vec_select_288 +.hidden vec_select_288 +.type vec_select_288,@function +.align 32 +vec_select_288: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 144(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 144(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 144(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-144(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-144(%rsi),%xmm0 
+ pand %xmm5,%xmm3 + movdqu 16+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-144(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-144(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-144(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-144(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-144(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-144(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-144(%rdi) + pand %xmm4,%xmm0 + movdqu 128+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-144(%rdi) + pand %xmm4,%xmm2 + movdqu 144+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-144(%rdi) + pand %xmm4,%xmm0 + movdqu 160+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-144(%rdi) + pand %xmm4,%xmm2 + movdqu 176+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 176+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,176-144(%rdi) + pand %xmm4,%xmm0 + movdqu 192+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 192+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,192-144(%rdi) + pand %xmm4,%xmm2 + movdqu 208+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 208+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,208-144(%rdi) + pand %xmm4,%xmm0 + movdqu 224+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 224+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,224-144(%rdi) + pand %xmm4,%xmm2 + movdqu 240+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 240+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,240-144(%rdi) + pand %xmm4,%xmm0 + movdqu 256+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 256+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,256-144(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,272-144(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_288,.-vec_select_288 +.globl vec_prefetch +.hidden vec_prefetch +.type vec_prefetch,@function +.align 32 +vec_prefetch: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + leaq -1(%rdi,%rsi,1),%rsi + movq $64,%rax + xorq %r8,%r8 + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + prefetchnta (%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_prefetch,.-vec_prefetch +.globl vec_is_zero_16x +.hidden vec_is_zero_16x +.type vec_is_zero_16x,@function +.align 32 +vec_is_zero_16x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + shrl $4,%esi + movdqu (%rdi),%xmm0 + leaq 
16(%rdi),%rdi + +.Loop_is_zero: + decl %esi + jz .Loop_is_zero_done + movdqu (%rdi),%xmm1 + leaq 16(%rdi),%rdi + por %xmm1,%xmm0 + jmp .Loop_is_zero + +.Loop_is_zero_done: + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 +.byte 102,72,15,126,192 + incl %esi + testq %rax,%rax + cmovnzl %esi,%eax + xorl $1,%eax + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_is_zero_16x,.-vec_is_zero_16x +.globl vec_is_equal_16x +.hidden vec_is_equal_16x +.type vec_is_equal_16x,@function +.align 32 +vec_is_equal_16x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + shrl $4,%edx + movdqu (%rdi),%xmm0 + movdqu (%rsi),%xmm1 + subq %rdi,%rsi + leaq 16(%rdi),%rdi + pxor %xmm1,%xmm0 + +.Loop_is_equal: + decl %edx + jz .Loop_is_equal_done + movdqu (%rdi),%xmm1 + movdqu (%rdi,%rsi,1),%xmm2 + leaq 16(%rdi),%rdi + pxor %xmm2,%xmm1 + por %xmm1,%xmm0 + jmp .Loop_is_equal + +.Loop_is_equal_done: + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 +.byte 102,72,15,126,192 + incl %edx + testq %rax,%rax + cmovnzl %edx,%eax + xorl $1,%eax + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_is_equal_16x,.-vec_is_equal_16x + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/add_mod_384x384-x86_64.s b/crypto/blst_src/build/elf/add_mod_384x384-x86_64.s new file mode 100644 index 00000000000..084f3d8262d --- /dev/null +++ b/crypto/blst_src/build/elf/add_mod_384x384-x86_64.s @@ -0,0 +1,252 @@ +.text + +.type __add_mod_384x384,@function +.align 32 +__add_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + addq 0(%rdx),%r8 + movq 56(%rsi),%r15 + adcq 8(%rdx),%r9 + movq 64(%rsi),%rax + adcq 16(%rdx),%r10 + movq 72(%rsi),%rbx + adcq 24(%rdx),%r11 + movq 80(%rsi),%rbp + adcq 32(%rdx),%r12 + movq 88(%rsi),%rsi + adcq 40(%rdx),%r13 + movq %r8,0(%rdi) + adcq 48(%rdx),%r14 + movq %r9,8(%rdi) + adcq 56(%rdx),%r15 + movq %r10,16(%rdi) + adcq 64(%rdx),%rax + movq %r12,32(%rdi) + movq %r14,%r8 + adcq 72(%rdx),%rbx + movq %r11,24(%rdi) + movq %r15,%r9 + adcq 80(%rdx),%rbp + movq %r13,40(%rdi) + movq %rax,%r10 + adcq 88(%rdx),%rsi + movq %rbx,%r11 + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %rbp,%r12 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%rbx + sbbq 32(%rcx),%rbp + movq %rsi,%r13 + sbbq 40(%rcx),%rsi + sbbq $0,%rdx + + cmovcq %r8,%r14 + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %r14,48(%rdi) + cmovcq %r11,%rbx + movq %r15,56(%rdi) + cmovcq %r12,%rbp + movq %rax,64(%rdi) + cmovcq %r13,%rsi + movq %rbx,72(%rdi) + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384x384,.-__add_mod_384x384 + +.type __sub_mod_384x384,@function +.align 32 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq 
%r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.globl add_mod_384x384 +.hidden add_mod_384x384 +.type add_mod_384x384,@function +.align 32 +add_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __add_mod_384x384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_384x384,.-add_mod_384x384 + +.globl sub_mod_384x384 +.hidden sub_mod_384x384 +.type sub_mod_384x384,@function +.align 32 +sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sub_mod_384x384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_384x384,.-sub_mod_384x384 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/ct_inverse_mod_256-armv8.S b/crypto/blst_src/build/elf/ct_inverse_mod_256-armv8.S new file mode 100644 index 00000000000..347eb315f40 --- /dev/null +++ b/crypto/blst_src/build/elf/ct_inverse_mod_256-armv8.S @@ -0,0 +1,784 @@ +.text + +.globl ct_inverse_mod_256 +.type ct_inverse_mod_256, %function +.align 5 +ct_inverse_mod_256: + .inst 0xd503233f + stp x29, x30, [sp,#-80]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + sub sp, sp, #1040 + + ldp x4, x5, [x1,#8*0] + ldp x6, x7, [x1,#8*2] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ str x0, [sp] + + ldp x8, x9, [x2,#8*0] + ldp x10, x11, [x2,#8*2] + + stp x4, x5, [x1,#8*0] // copy input to |a| + stp x6, x7, [x1,#8*2] + stp x8, x9, [x1,#8*4] // copy modulus to |b| + stp x10, x11, [x1,#8*6] + + ////////////////////////////////////////// first iteration + bl .Lab_approximation_31_256_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + str x12,[x0,#8*8] // initialize |u| with |f0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to dst |b| + bl __smul_256_n_shift_by_31 + str x12, [x0,#8*9] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + ldr x8, [x1,#8*8] // |u| + ldr x9, [x1,#8*13] // |v| + madd x4, x16, x8, xzr // |u|*|f0| + madd x4, x17, x9, x4 // |v|*|g0| + str x4, [x0,#8*4] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*5] + stp x5, x5, [x0,#8*7] + + madd x4, x12, x8, xzr // |u|*|f1| + madd x4, x13, x9, x4 // |v|*|g1| + str x4, [x0,#8*9] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*10] + stp x5, x5, [x0,#8*12] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst 
|a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc 
x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + ////////////////////////////////////////// two[!] 
last iterations + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #47 // 31 + 512 % 31 + //bl __ab_approximation_62_256 // |a| and |b| are exact, + ldr x7, [x1,#8*0] // just load + ldr x11, [x1,#8*4] + bl __inner_loop_62_256 + + mov x16, x14 + mov x17, x15 + ldr x0, [sp] // original out_ptr + bl __smul_256x63 + bl __smul_512x63_tail + ldr x30, [x29,#8] + + smulh x20, x7, x17 // figure out top-most limb + ldp x8, x9, [x3,#8*0] + adc x23, x23, x25 + ldp x10, x11, [x3,#8*2] + + add x20, x20, x23 // x20 is 1, 0 or -1 + asr x19, x20, #63 // sign as mask + + and x23, x8, x19 // add mod<<256 conditionally + and x24, x9, x19 + adds x4, x4, x23 + and x25, x10, x19 + adcs x5, x5, x24 + and x26, x11, x19 + adcs x6, x6, x25 + adcs x7, x22, x26 + adc x20, x20, xzr // x20 is 1, 0 or -1 + + neg x19, x20 + orr x20, x20, x19 // excess bit or sign as mask + asr x19, x19, #63 // excess bit as mask + + and x8, x8, x20 // mask |mod| + and x9, x9, x20 + and x10, x10, x20 + and x11, x11, x20 + + eor x8, x8, x19 // conditionally negate |mod| + eor x9, x9, x19 + adds x8, x8, x19, lsr#63 + eor x10, x10, x19 + adcs x9, x9, xzr + eor x11, x11, x19 + adcs x10, x10, xzr + adc x11, x11, xzr + + adds x4, x4, x8 // final adjustment for |mod|<<256 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*4] + adc x7, x7, x11 + stp x6, x7, [x0,#8*6] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldr x29, [sp],#80 + .inst 0xd50323bf + ret +.size ct_inverse_mod_256,.-ct_inverse_mod_256 + +//////////////////////////////////////////////////////////////////////// +.type __smul_256x63, %function +.align 5 +__smul_256x63: + ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|) + asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x6, x7, [x1,#8*2+64] + eor x16, x16, x14 // conditionally negate |f_| (or |g_|) + ldr x22, [x1,#8*4+64] + + eor x4, x4, x14 // conditionally negate |u| (or |v|) + sub x16, x16, x14 + eor x5, x5, x14 + adds x4, x4, x14, lsr#63 + eor x6, x6, x14 + adcs x5, x5, xzr + eor x7, x7, x14 + adcs x6, x6, xzr + eor x22, x22, x14 + umulh x19, x4, x16 + adcs x7, x7, xzr + umulh x20, x5, x16 + adcs x22, x22, xzr + umulh x21, x6, x16 + mul x4, x4, x16 + cmp x16, #0 + mul x5, x5, x16 + csel x22, x22, xzr, ne + mul x6, x6, x16 + adds x5, x5, x19 + mul x24, x7, x16 + adcs x6, x6, x20 + adcs x24, x24, x21 + adc x26, xzr, xzr + ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|) + asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x10, x11, [x1,#8*2+104] + eor x17, x17, x14 // conditionally negate |f_| (or |g_|) + ldr x23, [x1,#8*4+104] + + eor x8, x8, x14 // conditionally negate |u| (or |v|) + sub x17, x17, x14 + eor x9, x9, x14 + adds x8, x8, x14, lsr#63 + eor x10, x10, x14 + adcs x9, x9, xzr + eor x11, x11, x14 + adcs x10, x10, xzr + eor x23, x23, x14 + umulh x19, x8, x17 + adcs x11, x11, xzr + umulh x20, x9, x17 + adcs x23, x23, xzr + umulh x21, x10, x17 + adc x15, xzr, xzr // used in __smul_512x63_tail + mul x8, x8, x17 + cmp x17, #0 + mul x9, x9, x17 + csel x23, x23, xzr, ne + mul x10, x10, x17 + adds x9, x9, x19 + mul x25, x11, x17 + adcs x10, x10, x20 + adcs x25, x25, x21 + adc x26, x26, xzr + + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*0] + adcs x24, x24, x25 + stp x6, x24, [x0,#8*2] + + ret +.size __smul_256x63,.-__smul_256x63 + +.type __smul_512x63_tail, %function +.align 5 +__smul_512x63_tail: + umulh x24, x7, x16 + ldp x5, x6, [x1,#8*18] // load rest of |v| + adc x26, x26, xzr + ldr x7, 
[x1,#8*20] + and x22, x22, x16 + + umulh x11, x11, x17 // resume |v|*|g1| chain + + sub x24, x24, x22 // tie up |u|*|f1| chain + asr x25, x24, #63 + + eor x5, x5, x14 // conditionally negate rest of |v| + eor x6, x6, x14 + adds x5, x5, x15 + eor x7, x7, x14 + adcs x6, x6, xzr + umulh x19, x23, x17 + adc x7, x7, xzr + umulh x20, x5, x17 + add x11, x11, x26 + umulh x21, x6, x17 + + mul x4, x23, x17 + mul x5, x5, x17 + adds x4, x4, x11 + mul x6, x6, x17 + adcs x5, x5, x19 + mul x22, x7, x17 + adcs x6, x6, x20 + adcs x22, x22, x21 + adc x23, xzr, xzr // used in the final step + + adds x4, x4, x24 + adcs x5, x5, x25 + adcs x6, x6, x25 + stp x4, x5, [x0,#8*4] + adcs x22, x22, x25 // carry is used in the final step + stp x6, x22, [x0,#8*6] + + ret +.size __smul_512x63_tail,.-__smul_512x63_tail + +.type __smul_256_n_shift_by_31, %function +.align 5 +__smul_256_n_shift_by_31: + ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) + asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x6, x7, [x1,#8*2+0] + eor x25, x12, x24 // conditionally negate |f0| (or |g0|) + + eor x4, x4, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x5, x5, x24 + adds x4, x4, x24, lsr#63 + eor x6, x6, x24 + adcs x5, x5, xzr + eor x7, x7, x24 + umulh x19, x4, x25 + adcs x6, x6, xzr + umulh x20, x5, x25 + adc x7, x7, xzr + umulh x21, x6, x25 + and x24, x24, x25 + umulh x22, x7, x25 + neg x24, x24 + + mul x4, x4, x25 + mul x5, x5, x25 + mul x6, x6, x25 + adds x5, x5, x19 + mul x7, x7, x25 + adcs x6, x6, x20 + adcs x7, x7, x21 + adc x22, x22, x24 + ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) + asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x10, x11, [x1,#8*2+32] + eor x25, x13, x24 // conditionally negate |f0| (or |g0|) + + eor x8, x8, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x9, x9, x24 + adds x8, x8, x24, lsr#63 + eor x10, x10, x24 + adcs x9, x9, xzr + eor x11, x11, x24 + umulh x19, x8, x25 + adcs x10, x10, xzr + umulh x20, x9, x25 + adc x11, x11, xzr + umulh x21, x10, x25 + and x24, x24, x25 + umulh x23, x11, x25 + neg x24, x24 + + mul x8, x8, x25 + mul x9, x9, x25 + mul x10, x10, x25 + adds x9, x9, x19 + mul x11, x11, x25 + adcs x10, x10, x20 + adcs x11, x11, x21 + adc x23, x23, x24 + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x8, x22, x23 + + extr x4, x5, x4, #31 + extr x5, x6, x5, #31 + extr x6, x7, x6, #31 + asr x23, x8, #63 // result's sign as mask + extr x7, x8, x7, #31 + + eor x4, x4, x23 // ensure the result is positive + eor x5, x5, x23 + adds x4, x4, x23, lsr#63 + eor x6, x6, x23 + adcs x5, x5, xzr + eor x7, x7, x23 + adcs x6, x6, xzr + stp x4, x5, [x0,#8*0] + adc x7, x7, xzr + stp x6, x7, [x0,#8*2] + + eor x12, x12, x23 // adjust |f/g| accordingly + eor x13, x13, x23 + sub x12, x12, x23 + sub x13, x13, x23 + + ret +.size __smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31 +.type __ab_approximation_31_256, %function +.align 4 +__ab_approximation_31_256: + ldp x6, x7, [x1,#8*2] + ldp x10, x11, [x1,#8*6] + ldp x4, x5, [x1,#8*0] + ldp x8, x9, [x1,#8*4] + +.Lab_approximation_31_256_loaded: + orr x19, x7, x11 // check top-most limbs, ... + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x5, ne + orr x19, x7, x11 // and ones before top-most, ... + csel x10, x10, x9, ne + + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x4, ne + orr x19, x7, x11 // and one more, ... 
+ csel x10, x10, x8, ne + + clz x19, x19 + cmp x19, #64 + csel x19, x19, xzr, ne + csel x7, x7, x6, ne + csel x11, x11, x10, ne + neg x20, x19 + + lslv x7, x7, x19 // align high limbs to the left + lslv x11, x11, x19 + lsrv x6, x6, x20 + lsrv x10, x10, x20 + and x6, x6, x20, asr#6 + and x10, x10, x20, asr#6 + orr x7, x7, x6 + orr x11, x11, x10 + + bfxil x7, x4, #0, #31 + bfxil x11, x8, #0, #31 + + b __inner_loop_31_256 + ret +.size __ab_approximation_31_256,.-__ab_approximation_31_256 + +.type __inner_loop_31_256, %function +.align 4 +__inner_loop_31_256: + mov x2, #31 + mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x23,#0x7FFFFFFF7FFFFFFF + +.Loop_31_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x15 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x15, x15, x13, hs // exchange |fg0| and |fg1| + csel x13, x13, x19, hs + lsr x7, x7, #1 + and x19, x15, x22 + and x20, x23, x22 + sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x15, x15, x15 // |f1|<<=1 + add x13, x13, x20 + sub x15, x15, x23 + cbnz x2, .Loop_31_256 + + mov x23, #0x7FFFFFFF + ubfx x12, x13, #0, #32 + ubfx x13, x13, #32, #32 + ubfx x14, x15, #0, #32 + ubfx x15, x15, #32, #32 + sub x12, x12, x23 // remove bias + sub x13, x13, x23 + sub x14, x14, x23 + sub x15, x15, x23 + + ret +.size __inner_loop_31_256,.-__inner_loop_31_256 + +.type __inner_loop_62_256, %function +.align 4 +__inner_loop_62_256: + mov x12, #1 // |f0|=1 + mov x13, #0 // |g0|=0 + mov x14, #0 // |f1|=0 + mov x15, #1 // |g1|=1 + +.Loop_62_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x12 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + mov x20, x13 + csel x12, x12, x14, hs // exchange |f0| and |f1| + csel x14, x14, x19, hs + csel x13, x13, x15, hs // exchange |g0| and |g1| + csel x15, x15, x20, hs + lsr x7, x7, #1 + and x19, x14, x22 + and x20, x15, x22 + add x14, x14, x14 // |f1|<<=1 + add x15, x15, x15 // |g1|<<=1 + sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) 
+ cbnz x2, .Loop_62_256 + + ret +.size __inner_loop_62_256,.-__inner_loop_62_256 diff --git a/crypto/blst_src/build/elf/ct_inverse_mod_256-x86_64.s b/crypto/blst_src/build/elf/ct_inverse_mod_256-x86_64.s new file mode 100644 index 00000000000..c4d8d6d3700 --- /dev/null +++ b/crypto/blst_src/build/elf/ct_inverse_mod_256-x86_64.s @@ -0,0 +1,1185 @@ +.text + +.globl ct_inverse_mod_256 +.type ct_inverse_mod_256,@function +.align 32 +ct_inverse_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1072,%rsp +.cfi_adjust_cfa_offset 1072 + + + leaq 48+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + movq 0(%rdx),%r12 + movq 8(%rdx),%r13 + movq 16(%rdx),%r14 + movq 24(%rdx),%r15 + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + + movq %r12,32(%rax) + movq %r13,40(%rax) + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rax,%rsi + + + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,64(%rdi) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,72(%rdi) + + + xorq $256,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + + movq 64(%rsi),%r8 + movq 104(%rsi),%r12 + movq %r8,%r9 + imulq 0(%rsp),%r8 + movq %r12,%r13 + imulq 8(%rsp),%r12 + addq %r12,%r8 + movq %r8,32(%rdi) + sarq $63,%r8 + movq %r8,40(%rdi) + movq %r8,48(%rdi) + movq %r8,56(%rdi) + movq %r8,64(%rdi) + leaq 64(%rsi),%rsi + + imulq %rdx,%r9 + imulq %rcx,%r13 + addq %r13,%r9 + movq %r9,72(%rdi) + sarq $63,%r9 + movq %r9,80(%rdi) + movq %r9,88(%rdi) + movq %r9,96(%rdi) + movq %r9,104(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + 
xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + sarq $63,%rbp + movq %rbp,40(%rdi) + movq %rbp,48(%rdi) + movq %rbp,56(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + 
xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + + xorq $256+64,%rsi + movl $47,%edx + + movq 0(%rsi),%r8 + + movq 32(%rsi),%r10 + + call __inner_loop_62_256 + + + + + + + + leaq 64(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_512x63 + adcq %rbp,%rdx + + movq 40(%rsp),%rsi + movq %rdx,%rax + sarq $63,%rdx + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %rdx,%r15 + adcq $0,%rax + + movq %rax,%rdx + negq %rax + orq %rax,%rdx + sarq $63,%rax + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + xorq %rax,%r8 + xorq %rcx,%rcx + xorq %rax,%r9 + subq %rax,%rcx + xorq %rax,%r10 + xorq %rax,%rdx + addq %rcx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%rdx + + addq %r8,%r12 + adcq 
%r9,%r13 + adcq %r10,%r14 + adcq %rdx,%r15 + + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 1072(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1072-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size ct_inverse_mod_256,.-ct_inverse_mod_256 +.type __smulq_512x63,@function +.align 32 +__smulq_512x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %r9,8(%rdi) + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %r10,16(%rdi) + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %r11,24(%rdi) + + movq 40(%rsi),%r8 + movq 48(%rsi),%r9 + movq 56(%rsi),%r10 + movq 64(%rsi),%r11 + movq 72(%rsi),%r12 + movq 80(%rsi),%r13 + movq 88(%rsi),%r14 + movq 96(%rsi),%r15 + + movq %rcx,%rdx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rcx + addq %rax,%rcx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rcx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rcx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rcx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rcx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rcx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rcx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rcx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + imulq %rcx + addq %rax,%r15 + adcq $0,%rdx + + movq %rbp,%rbx + sarq $63,%rbp + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq %rbx,%r12 + adcq %rbp,%r13 + adcq %rbp,%r14 + adcq %rbp,%r15 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_512x63,.-__smulq_512x63 + +.type __smulq_256x63,@function +.align 32 +__smulq_256x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq 
$0,%rdx + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %rcx,%rdx + movq 40+0(%rsi),%r12 + movq 40+8(%rsi),%r13 + movq 40+16(%rsi),%r14 + movq 40+24(%rsi),%r15 + movq 40+32(%rsi),%rcx + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rcx + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rcx + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + andq %rbx,%rcx + negq %rcx + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %rbp,32(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_256x63,.-__smulq_256x63 +.type __smulq_256_n_shift_by_31,@function +.align 32 +__smulq_256_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,0(%rdi) + movq %rcx,8(%rdi) + movq %rdx,%rbp + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + + movq %rbp,%rbx + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rbx + addq %rax,%rbx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + andq %rbx,%rbp + negq %rbp + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r14 + movq 32+24(%rsi),%r15 + + movq %rcx,%rbx + sarq $63,%rcx + xorq %rax,%rax + subq %rcx,%rax + + xorq %rcx,%rbx + addq %rax,%rbx + + xorq %rcx,%r12 + xorq %rcx,%r13 + xorq %rcx,%r14 + xorq %rcx,%r15 + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + andq %rbx,%rcx + negq %rcx + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq 0(%rdi),%rdx + movq 8(%rdi),%rcx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%rbp,%r11 + + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + xorq %rbp,%rdx + xorq %rbp,%rcx + addq %rax,%rdx + addq %rax,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_256_n_shift_by_31,.-__smulq_256_n_shift_by_31 +.type __ab_approximation_31_256,@function +.align 32 +__ab_approximation_31_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 24(%rsi),%r9 + movq 56(%rsi),%r11 + movq 16(%rsi),%rbx + movq 48(%rsi),%rbp + movq 8(%rsi),%r8 + movq 40(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 32(%rsi),%r10 + + movq %r9,%rax + orq 
%r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + notq %rax + andq %rax,%r9 + andq %rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp __inner_loop_31_256 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ab_approximation_31_256,.-__ab_approximation_31_256 +.type __inner_loop_31_256,@function +.align 32 +__inner_loop_31_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +.Loop_31_256: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edx + jnz .Loop_31_256 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_31_256,.-__inner_loop_31_256 + +.type __inner_loop_62_256,@function +.align 32 +__inner_loop_62_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movl %edx,%r15d + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq %rdx,%r13 + movq %rdx,%r14 + +.Loop_62_256: + xorq %rax,%rax + testq %r14,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq %r14,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%r15d + jnz .Loop_62_256 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_62_256,.-__inner_loop_62_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/ct_inverse_mod_384-armv8.S b/crypto/blst_src/build/elf/ct_inverse_mod_384-armv8.S new file mode 100644 index 00000000000..d7eca17073c --- /dev/null +++ b/crypto/blst_src/build/elf/ct_inverse_mod_384-armv8.S @@ -0,0 +1,717 @@ +.text + +.globl ct_inverse_mod_383 +.type ct_inverse_mod_383, %function +.align 5 +ct_inverse_mod_383: + .inst 0xd503233f + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #1040 + + ldp x22, x4, [x1,#8*0] + ldp x5, x6, [x1,#8*2] + ldp x7, x8, [x1,#8*4] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ stp x0, x3, [sp] + + ldp x9, x10, [x2,#8*0] + ldp x11, x12, [x2,#8*2] + ldp x13, x14, [x2,#8*4] + + stp x22, x4, [x1,#8*0] // copy input to |a| + stp x5, x6, [x1,#8*2] + stp x7, x8, [x1,#8*4] + stp x9, x10, [x1,#8*6] // copy modulus to |b| + stp x11, x12, [x1,#8*8] + stp x13, x14, [x1,#8*10] + + ////////////////////////////////////////// first iteration + mov x2, #62 + bl .Lab_approximation_62_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + str x15,[x0,#8*12] // initialize |u| with |f0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to dst |b| + bl __smul_383_n_shift_by_62 + str x15, [x0,#8*12] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + ldr x7, [x1,#8*12] // |u| + ldr x8, [x1,#8*18] // |v| + mul x3, x20, x7 // |u|*|f0| + smulh x4, x20, x7 + mul x5, x21, x8 // |v|*|g0| + smulh x6, x21, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*6] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*8] + stp x5, x5, [x0,#8*10] + + mul x3, x15, x7 // |u|*|f1| + smulh x4, x15, x7 + mul x5, x16, x8 // |v|*|g1| + smulh x6, x16, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*12] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*14] + stp x5, x5, [x0,#8*16] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 
// corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + asr x27, x27, #63 // sign extension + stp x27, x27, [x0,#8*6] + stp x27, x27, [x0,#8*8] + stp x27, x27, [x0,#8*10] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + ////////////////////////////////////////// iteration before last + eor x1, x1, #256 // flip-flop src 
|a|b|u|v| + mov x2, #62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldp x3, x8, [x1,#8*0] // just load + ldp x9, x14, [x1,#8*6] + bl __inner_loop_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + str x3, [x0,#8*0] + str x9, [x0,#8*6] + + mov x20, x15 // exact |f0| + mov x21, x16 // exact |g0| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*12 // pointer to dst |u| + bl __smul_383x63 + + mov x20, x15 // exact |f1| + mov x21, x16 // exact |g1| + add x0, x0, #8*6 // pointer to dst |v| + bl __smul_383x63 + bl __smul_767x63_tail + + ////////////////////////////////////////// last iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #22 // 766 % 62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldr x3, [x1,#8*0] // just load + eor x8, x8, x8 + ldr x9, [x1,#8*6] + eor x14, x14, x14 + bl __inner_loop_62 + + mov x20, x17 + mov x21, x19 + ldp x0, x15, [sp] // original out_ptr and n_ptr + bl __smul_383x63 + bl __smul_767x63_tail + ldr x30, [x29,#8] + + asr x22, x8, #63 // sign as mask + ldp x9, x10, [x15,#8*0] + ldp x11, x12, [x15,#8*2] + ldp x13, x14, [x15,#8*4] + + and x9, x9, x22 // add mod<<384 conditionally + and x10, x10, x22 + adds x3, x3, x9 + and x11, x11, x22 + adcs x4, x4, x10 + and x12, x12, x22 + adcs x5, x5, x11 + and x13, x13, x22 + adcs x6, x6, x12 + and x14, x14, x22 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*8] + adc x8, x8, x14 + stp x7, x8, [x0,#8*10] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + .inst 0xd50323bf + ret +.size ct_inverse_mod_383,.-ct_inverse_mod_383 + +//////////////////////////////////////////////////////////////////////// +// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
+.type __smul_383x63, %function +.align 5 +__smul_383x63: + ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) + asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x5, x6, [x1,#8*2+96] + eor x20, x20, x17 // conditionally negate |f_| (or |g_|) + ldp x7, x8, [x1,#8*4+96] + + eor x3, x3, x17 // conditionally negate |u| (or |v|) + sub x20, x20, x17 + eor x4, x4, x17 + adds x3, x3, x17, lsr#63 + eor x5, x5, x17 + adcs x4, x4, xzr + eor x6, x6, x17 + adcs x5, x5, xzr + eor x7, x7, x17 + adcs x6, x6, xzr + umulh x22, x3, x20 + eor x8, x8, x17 + umulh x23, x4, x20 + adcs x7, x7, xzr + umulh x24, x5, x20 + adcs x8, x8, xzr + umulh x25, x6, x20 + umulh x26, x7, x20 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x22 + mul x6, x6, x20 + adcs x5, x5, x23 + mul x7, x7, x20 + adcs x6, x6, x24 + mul x27,x8, x20 + adcs x7, x7, x25 + adcs x27,x27,x26 + adc x2, xzr, xzr + ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|) + asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x11, x12, [x1,#8*2+144] + eor x21, x21, x17 // conditionally negate |f_| (or |g_|) + ldp x13, x14, [x1,#8*4+144] + + eor x9, x9, x17 // conditionally negate |u| (or |v|) + sub x21, x21, x17 + eor x10, x10, x17 + adds x9, x9, x17, lsr#63 + eor x11, x11, x17 + adcs x10, x10, xzr + eor x12, x12, x17 + adcs x11, x11, xzr + eor x13, x13, x17 + adcs x12, x12, xzr + umulh x22, x9, x21 + eor x14, x14, x17 + umulh x23, x10, x21 + adcs x13, x13, xzr + umulh x24, x11, x21 + adcs x14, x14, xzr + umulh x25, x12, x21 + adc x19, xzr, xzr // used in __smul_767x63_tail + umulh x26, x13, x21 + mul x9, x9, x21 + mul x10, x10, x21 + mul x11, x11, x21 + adds x10, x10, x22 + mul x12, x12, x21 + adcs x11, x11, x23 + mul x13, x13, x21 + adcs x12, x12, x24 + mul x28,x14, x21 + adcs x13, x13, x25 + adcs x28,x28,x26 + adc x2, x2, xzr + + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + stp x3, x4, [x0,#8*0] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*2] + adcs x27, x27, x28 + stp x7, x27, [x0,#8*4] + adc x28, x2, xzr // used in __smul_767x63_tail + + ret +.size __smul_383x63,.-__smul_383x63 + +.type __smul_767x63_tail, %function +.align 5 +__smul_767x63_tail: + smulh x27, x8, x20 + ldp x3, x4, [x1,#8*24] // load rest of |v| + umulh x14,x14, x21 + ldp x5, x6, [x1,#8*26] + ldp x7, x8, [x1,#8*28] + + eor x3, x3, x17 // conditionally negate rest of |v| + eor x4, x4, x17 + eor x5, x5, x17 + adds x3, x3, x19 + eor x6, x6, x17 + adcs x4, x4, xzr + eor x7, x7, x17 + adcs x5, x5, xzr + eor x8, x8, x17 + adcs x6, x6, xzr + umulh x22, x3, x21 + adcs x7, x7, xzr + umulh x23, x4, x21 + adc x8, x8, xzr + + umulh x24, x5, x21 + add x14, x14, x28 + umulh x25, x6, x21 + asr x28, x27, #63 + umulh x26, x7, x21 + mul x3, x3, x21 + mul x4, x4, x21 + mul x5, x5, x21 + adds x3, x3, x14 + mul x6, x6, x21 + adcs x4, x4, x22 + mul x7, x7, x21 + adcs x5, x5, x23 + mul x8, x8, x21 + adcs x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, x26 + + adds x3, x3, x27 + adcs x4, x4, x28 + adcs x5, x5, x28 + adcs x6, x6, x28 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x28 + stp x5, x6, [x0,#8*8] + adc x8, x8, x28 + stp x7, x8, [x0,#8*10] + + ret +.size __smul_767x63_tail,.-__smul_767x63_tail + +.type __smul_383_n_shift_by_62, %function +.align 5 +__smul_383_n_shift_by_62: + ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) + asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x5, x6, [x1,#8*2+0] + eor x2, x15, x28 // conditionally negate |f0| (or |g0|) + ldp x7, x8, [x1,#8*4+0] + + eor x3, x3, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + 
eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + umulh x22, x3, x2 + adcs x6, x6, xzr + umulh x23, x4, x2 + eor x8, x8, x28 + umulh x24, x5, x2 + adcs x7, x7, xzr + umulh x25, x6, x2 + adc x8, x8, xzr + + umulh x26, x7, x2 + smulh x27, x8, x2 + mul x3, x3, x2 + mul x4, x4, x2 + mul x5, x5, x2 + adds x4, x4, x22 + mul x6, x6, x2 + adcs x5, x5, x23 + mul x7, x7, x2 + adcs x6, x6, x24 + mul x8, x8, x2 + adcs x7, x7, x25 + adcs x8, x8 ,x26 + adc x27, x27, xzr + ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) + asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x11, x12, [x1,#8*2+48] + eor x2, x16, x28 // conditionally negate |f0| (or |g0|) + ldp x13, x14, [x1,#8*4+48] + + eor x9, x9, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x10, x10, x28 + adds x9, x9, x28, lsr#63 + eor x11, x11, x28 + adcs x10, x10, xzr + eor x12, x12, x28 + adcs x11, x11, xzr + eor x13, x13, x28 + umulh x22, x9, x2 + adcs x12, x12, xzr + umulh x23, x10, x2 + eor x14, x14, x28 + umulh x24, x11, x2 + adcs x13, x13, xzr + umulh x25, x12, x2 + adc x14, x14, xzr + + umulh x26, x13, x2 + smulh x28, x14, x2 + mul x9, x9, x2 + mul x10, x10, x2 + mul x11, x11, x2 + adds x10, x10, x22 + mul x12, x12, x2 + adcs x11, x11, x23 + mul x13, x13, x2 + adcs x12, x12, x24 + mul x14, x14, x2 + adcs x13, x13, x25 + adcs x14, x14 ,x26 + adc x28, x28, xzr + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x27, x28 + + extr x3, x4, x3, #62 + extr x4, x5, x4, #62 + extr x5, x6, x5, #62 + asr x28, x9, #63 + extr x6, x7, x6, #62 + extr x7, x8, x7, #62 + extr x8, x9, x8, #62 + + eor x3, x3, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + adcs x6, x6, xzr + eor x8, x8, x28 + stp x3, x4, [x0,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x0,#8*2] + adc x8, x8, xzr + stp x7, x8, [x0,#8*4] + + eor x15, x15, x28 + eor x16, x16, x28 + sub x15, x15, x28 + sub x16, x16, x28 + + ret +.size __smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62 +.type __ab_approximation_62, %function +.align 4 +__ab_approximation_62: + ldp x7, x8, [x1,#8*4] + ldp x13, x14, [x1,#8*10] + ldp x5, x6, [x1,#8*2] + ldp x11, x12, [x1,#8*8] + +.Lab_approximation_62_loaded: + orr x22, x8, x14 // check top-most limbs, ... + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x22, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + ldp x3, x4, [x1,#8*0] + ldp x9, x10, [x1,#8*6] + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x22, x8, x14 // ... and ones before that ... 
+ csel x13, x13, x11, ne + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x22, x8, x14 + csel x13, x13, x10, ne + + clz x22, x22 + cmp x22, #64 + csel x22, x22, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x23, x22 + + lslv x8, x8, x22 // align high limbs to the left + lslv x14, x14, x22 + lsrv x7, x7, x23 + lsrv x13, x13, x23 + and x7, x7, x23, asr#6 + and x13, x13, x23, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + b __inner_loop_62 + ret +.size __ab_approximation_62,.-__ab_approximation_62 +.type __inner_loop_62, %function +.align 4 +__inner_loop_62: + mov x15, #1 // |f0|=1 + mov x16, #0 // |g0|=0 + mov x17, #0 // |f1|=0 + mov x19, #1 // |g1|=1 + +.Loop_62: + sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + subs x24, x9, x3 // |b_|-|a_| + and x22, x9, x28 + sbc x25, x14, x8 + and x23, x14, x28 + subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x22, x15 + sbcs x27, x8, x23 + mov x23, x16 + csel x9, x9, x3, hs // |b_| = |a_| + csel x14, x14, x8, hs + csel x3, x26, x24, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x8, x27, x25, hs + csel x15, x15, x17, hs // exchange |f0| and |f1| + csel x17, x17, x22, hs + csel x16, x16, x19, hs // exchange |g0| and |g1| + csel x19, x19, x23, hs + extr x3, x8, x3, #1 + lsr x8, x8, #1 + and x22, x17, x28 + and x23, x19, x28 + add x17, x17, x17 // |f1|<<=1 + add x19, x19, x19 // |g1|<<=1 + sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...) + cbnz x2, .Loop_62 + + ret +.size __inner_loop_62,.-__inner_loop_62 diff --git a/crypto/blst_src/build/elf/ct_is_square_mod_384-armv8.S b/crypto/blst_src/build/elf/ct_is_square_mod_384-armv8.S new file mode 100644 index 00000000000..3f1390ed9dc --- /dev/null +++ b/crypto/blst_src/build/elf/ct_is_square_mod_384-armv8.S @@ -0,0 +1,324 @@ +.text + +.globl ct_is_square_mod_384 +.type ct_is_square_mod_384, %function +.align 5 +ct_is_square_mod_384: + .inst 0xd503233f + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #512 + + ldp x3, x4, [x0,#8*0] // load input + ldp x5, x6, [x0,#8*2] + ldp x7, x8, [x0,#8*4] + + add x0, sp, #255 // find closest 256-byte-aligned spot + and x0, x0, #-256 // in the frame... 
+ + ldp x9, x10, [x1,#8*0] // load modulus + ldp x11, x12, [x1,#8*2] + ldp x13, x14, [x1,#8*4] + + stp x3, x4, [x0,#8*6] // copy input to |a| + stp x5, x6, [x0,#8*8] + stp x7, x8, [x0,#8*10] + stp x9, x10, [x0,#8*0] // copy modulus to |b| + stp x11, x12, [x0,#8*2] + stp x13, x14, [x0,#8*4] + + eor x2, x2, x2 // init the .Legendre symbol + mov x15, #24 // 24 is 768/30-1 + b .Loop_is_square + +.align 4 +.Loop_is_square: + bl __ab_approximation_30 + sub x15, x15, #1 + + eor x1, x0, #128 // pointer to dst |b| + bl __smul_384_n_shift_by_30 + + mov x19, x16 // |f0| + mov x20, x17 // |g0| + add x1, x1, #8*6 // pointer to dst |a| + bl __smul_384_n_shift_by_30 + + ldp x9, x10, [x1,#-8*6] + eor x0, x0, #128 // flip-flop src |a|b| + and x27, x27, x9 // if |a| was negative, + add x2, x2, x27, lsr#1 // adjust |L| + + cbnz x15, .Loop_is_square + + ////////////////////////////////////////// last iteration + //bl __ab_approximation_30 // |a| and |b| are exact, + //ldr x8, [x0,#8*6] // and loaded + //ldr x14, [x0,#8*0] + mov x15, #48 // 48 is 768%30 + 30 + bl __inner_loop_48 + ldr x30, [x29,#8] + + and x0, x2, #1 + eor x0, x0, #1 + + add sp, sp, #512 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + .inst 0xd50323bf + ret +.size ct_is_square_mod_384,.-ct_is_square_mod_384 + +.type __smul_384_n_shift_by_30, %function +.align 5 +__smul_384_n_shift_by_30: + ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|) + asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x5, x6, [x0,#8*2+0] + eor x20, x20, x27 // conditionally negate |g1| (or |f1|) + ldp x7, x8, [x0,#8*4+0] + + eor x3, x3, x27 // conditionally negate |b| (or |a|) + sub x20, x20, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + umulh x21, x3, x20 + adcs x6, x6, xzr + umulh x22, x4, x20 + eor x8, x8, x27 + umulh x23, x5, x20 + adcs x7, x7, xzr + umulh x24, x6, x20 + adc x8, x8, xzr + + umulh x25, x7, x20 + and x28, x20, x27 + umulh x26, x8, x20 + neg x28, x28 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x21 + mul x6, x6, x20 + adcs x5, x5, x22 + mul x7, x7, x20 + adcs x6, x6, x23 + mul x8, x8, x20 + adcs x7, x7, x24 + adcs x8, x8 ,x25 + adc x26, x26, x28 + ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|) + asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x11, x12, [x0,#8*2+48] + eor x19, x19, x27 // conditionally negate |g1| (or |f1|) + ldp x13, x14, [x0,#8*4+48] + + eor x9, x9, x27 // conditionally negate |b| (or |a|) + sub x19, x19, x27 + eor x10, x10, x27 + adds x9, x9, x27, lsr#63 + eor x11, x11, x27 + adcs x10, x10, xzr + eor x12, x12, x27 + adcs x11, x11, xzr + eor x13, x13, x27 + umulh x21, x9, x19 + adcs x12, x12, xzr + umulh x22, x10, x19 + eor x14, x14, x27 + umulh x23, x11, x19 + adcs x13, x13, xzr + umulh x24, x12, x19 + adc x14, x14, xzr + + umulh x25, x13, x19 + and x28, x19, x27 + umulh x27, x14, x19 + neg x28, x28 + mul x9, x9, x19 + mul x10, x10, x19 + mul x11, x11, x19 + adds x10, x10, x21 + mul x12, x12, x19 + adcs x11, x11, x22 + mul x13, x13, x19 + adcs x12, x12, x23 + mul x14, x14, x19 + adcs x13, x13, x24 + adcs x14, x14 ,x25 + adc x27, x27, x28 + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x26, x27 + + extr x3, x4, x3, #30 + extr x4, x5, x4, #30 + extr x5, x6, x5, #30 + asr x27, x9, #63 + extr x6, x7, x6, #30 + extr x7, x8, x7, 
#30 + extr x8, x9, x8, #30 + + eor x3, x3, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + adcs x6, x6, xzr + eor x8, x8, x27 + stp x3, x4, [x1,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x1,#8*2] + adc x8, x8, xzr + stp x7, x8, [x1,#8*4] + + ret +.size __smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30 +.type __ab_approximation_30, %function +.align 4 +__ab_approximation_30: + ldp x13, x14, [x0,#8*4] // |a| is still in registers + ldp x11, x12, [x0,#8*2] + + orr x21, x8, x14 // check top-most limbs, ... + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x21, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x21, x8, x14 // ... and ones before that ... + csel x13, x13, x11, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x21, x8, x14 // and one more, ... + csel x13, x13, x10, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x3, ne + orr x21, x8, x14 + csel x13, x13, x9, ne + + clz x21, x21 + cmp x21, #64 + csel x21, x21, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x22, x21 + + lslv x8, x8, x21 // align high limbs to the left + lslv x14, x14, x21 + lsrv x7, x7, x22 + lsrv x13, x13, x22 + and x7, x7, x22, asr#6 + and x13, x13, x22, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + bfxil x8, x3, #0, #32 + bfxil x14, x9, #0, #32 + + b __inner_loop_30 + ret +.size __ab_approximation_30,.-__ab_approximation_30 + +.type __inner_loop_30, %function +.align 4 +__inner_loop_30: + mov x28, #30 + mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x27,#0x7FFFFFFF7FFFFFFF + +.Loop_30: + sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x8, x14 + sub x28, x28, #1 + and x21, x14, x24 + + sub x22, x14, x8 // |b_|-|a_| + subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 + mov x21, x20 + csel x14, x14, x8, hs // |b_| = |a_| + csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x20, x20, x17, hs // exchange |fg0| and |fg1| + csel x17, x17, x21, hs + csel x2, x2, x25, hs + lsr x8, x8, #1 + and x21, x20, x24 + and x22, x27, x24 + add x23, x14, #2 + sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x20, x20, x20 // |f1|<<=1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + add x17, x17, x22 + sub x20, x20, x27 + + cbnz x28, .Loop_30 + + mov x27, #0x7FFFFFFF + ubfx x16, x17, #0, #32 + ubfx x17, x17, #32, #32 + ubfx x19, x20, #0, #32 + ubfx x20, x20, #32, #32 + sub x16, x16, x27 // remove the bias + sub x17, x17, x27 + sub x19, x19, x27 + sub x20, x20, x27 + + ret +.size __inner_loop_30,.-__inner_loop_30 +.type __inner_loop_48, %function +.align 4 +__inner_loop_48: +.Loop_48: + sbfx x24, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x3, x9 + sub x15, x15, #1 + and x21, x9, x24 + sub x22, x9, x3 // |b_|-|a_| + subs x23, x3, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 + csel x9, x9, x3, hs // |b_| = |a_| + csel x3, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x2, x2, x25, hs + add x23, x9, #2 + lsr x3, x3, #1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + + cbnz x15, .Loop_48 + + ret +.size 
__inner_loop_48,.-__inner_loop_48 diff --git a/crypto/blst_src/build/elf/ct_is_square_mod_384-x86_64.s b/crypto/blst_src/build/elf/ct_is_square_mod_384-x86_64.s new file mode 100644 index 00000000000..fec1493cb12 --- /dev/null +++ b/crypto/blst_src/build/elf/ct_is_square_mod_384-x86_64.s @@ -0,0 +1,479 @@ +.text + +.globl ct_is_square_mod_384 +.type ct_is_square_mod_384,@function +.align 32 +ct_is_square_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $536,%rsp +.cfi_adjust_cfa_offset 536 + + + leaq 24+255(%rsp),%rax + andq $-256,%rax + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbx + movq 24(%rsi),%rcx + movq 32(%rsi),%rdx + movq 40(%rsi),%rdi + movq %rax,%rsi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rcx,72(%rax) + movq %rdx,80(%rax) + movq %rdi,88(%rax) + + xorq %rbp,%rbp + movl $24,%ecx + jmp .Loop_is_square + +.align 32 +.Loop_is_square: + movl %ecx,16(%rsp) + + call __ab_approximation_30 + movq %rax,0(%rsp) + movq %rbx,8(%rsp) + + movq $128+48,%rdi + xorq %rsi,%rdi + call __smulq_384_n_shift_by_30 + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq -48(%rdi),%rdi + call __smulq_384_n_shift_by_30 + + movl 16(%rsp),%ecx + xorq $128,%rsi + + andq 48(%rdi),%r14 + shrq $1,%r14 + addq %r14,%rbp + + subl $1,%ecx + jnz .Loop_is_square + + + + + movq 48(%rsi),%r9 + call __inner_loop_48 + + movq $1,%rax + andq %rbp,%rax + xorq $1,%rax + + leaq 536(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -536-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size ct_is_square_mod_384,.-ct_is_square_mod_384 + +.type __smulq_384_n_shift_by_30,@function +.align 32 +__smulq_384_n_shift_by_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r14 + andq %rbx,%r14 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r14 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r14 + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + 
movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r15 + andq %rbx,%r15 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r15 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r15 + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %r15,%r14 + + shrdq $30,%r9,%r8 + shrdq $30,%r10,%r9 + shrdq $30,%r11,%r10 + shrdq $30,%r12,%r11 + shrdq $30,%r13,%r12 + shrdq $30,%r14,%r13 + + sarq $63,%r14 + xorq %rbx,%rbx + subq %r14,%rbx + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_384_n_shift_by_30,.-__smulq_384_n_shift_by_30 +.type __ab_approximation_30,@function +.align 32 +__ab_approximation_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 88(%rsi),%rbx + movq 80(%rsi),%r15 + movq 72(%rsi),%r14 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r11,%r12 + movq 64(%rsi),%r11 + cmovzq %r14,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r10,%r12 + movq 56(%rsi),%r10 + cmovzq %r11,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r9,%r12 + movq 48(%rsi),%r9 + cmovzq %r10,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r8,%r12 + cmovzq %r9,%r15 + + movq %r13,%rax + orq %rbx,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r13 + cmovzq %r9,%rbx + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%r12,%r13 + shldq %cl,%r15,%rbx + + movq $0xFFFFFFFF00000000,%rax + movl %r8d,%r8d + movl %r9d,%r9d + andq %rax,%r13 + andq %rax,%rbx + orq %r13,%r8 + orq %rbx,%r9 + + jmp __inner_loop_30 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ab_approximation_30,.-__ab_approximation_30 +.type __inner_loop_30,@function +.align 32 +__inner_loop_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rbx + movq $0x800000007FFFFFFF,%rcx + leaq -1(%rbx),%r15 + movl $30,%edi + +.Loop_30: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbx,%r12 + movq %rcx,%r13 + movq %rbp,%r14 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rcx,%rbx + cmovbq %r12,%rcx + cmovbq %rax,%rbp + + subq %r9,%r8 + subq %rcx,%rbx + addq %r15,%rbx + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbx + cmovzq %r13,%rcx + cmovzq %r14,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rcx,%rcx + leaq (%rax,%rbp,1),%rbp + subq 
%r15,%rcx + + subl $1,%edi + jnz .Loop_30 + + shrq $32,%r15 + movl %ebx,%eax + shrq $32,%rbx + movl %ecx,%edx + shrq $32,%rcx + subq %r15,%rax + subq %r15,%rbx + subq %r15,%rdx + subq %r15,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_30,.-__inner_loop_30 + +.type __inner_loop_48,@function +.align 32 +__inner_loop_48: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movl $48,%edi + +.Loop_48: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbp,%r12 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rax,%rbp + + subq %r9,%r8 + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rax,%rbp + + subl $1,%edi + jnz .Loop_48 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_48,.-__inner_loop_48 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/ctq_inverse_mod_384-x86_64.s b/crypto/blst_src/build/elf/ctq_inverse_mod_384-x86_64.s new file mode 100644 index 00000000000..b702262f6e5 --- /dev/null +++ b/crypto/blst_src/build/elf/ctq_inverse_mod_384-x86_64.s @@ -0,0 +1,1195 @@ +.text + +.globl ct_inverse_mod_383 +.type ct_inverse_mod_383,@function +.align 32 +ct_inverse_mod_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1112,%rsp +.cfi_adjust_cfa_offset 1112 + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + 
addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call 
__smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + xorq $256+96,%rsi + movl $62,%edi + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 48(%rsi),%r10 + movq 56(%rsi),%r11 + call __inner_loop_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + movq %r8,0(%rdi) + movq %r10,48(%rdi) + + + + leaq 96(%rsi),%rsi + leaq 96(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + + xorq $256+96,%rsi + movl $22,%edi + + movq 0(%rsi),%r8 + xorq %r9,%r9 + movq 48(%rsi),%r10 + xorq %r11,%r11 + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1112-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size ct_inverse_mod_383,.-ct_inverse_mod_383 +.type __smulq_767x63,@function +.align 32 +__smulq_767x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 
40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + movq %r9,8(%rdi) + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + movq %r10,16(%rdi) + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %r11,24(%rdi) + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + movq %r12,32(%rdi) + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + movq %r13,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + movq %rdx,%rsi + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rsi + addq %rax,%rsi + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rbx + xorq %rdx,%rbp + xorq %rdx,%rcx + xorq %rdx,%rdi + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulq %rsi + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rsi + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rsi + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rsi + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rsi + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rsi + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %rdx,%rbx + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + mulq %rsi + addq %rax,%rbp + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rcx + mulq %rsi + addq %rax,%rcx + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%rdi + movq 8(%rsp),%rdx + imulq %rsi,%rax + movq 16(%rsp),%rsi + addq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_767x63,.-__smulq_767x63 +.type __smulq_383x63,@function +.align 32 +__smulq_383x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq 
%rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_383x63,.-__smulq_383x63 +.type __smulq_383_n_shift_by_62,@function +.align 32 +__smulq_383_n_shift_by_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq 48(%rsi),%rsi + movq %rdx,%r14 + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq 
%rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $62,%r9,%r8 + shrdq $62,%r10,%r9 + shrdq $62,%r11,%r10 + shrdq $62,%r12,%r11 + shrdq $62,%r13,%r12 + shrdq $62,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_383_n_shift_by_62,.-__smulq_383_n_shift_by_62 +.type __ab_approximation_62,@function +.align 32 +__ab_approximation_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 16(%rsi),%r8 + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 8(%rsi),%r8 + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 0(%rsi),%r8 + movq 48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + jmp __inner_loop_62 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ab_approximation_62,.-__ab_approximation_62 +.type __inner_loop_62,@function +.align 8 +.long 0 +__inner_loop_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + movq %rsi,8(%rsp) + +.Loop_62: + xorq %rax,%rax + xorq %rbx,%rbx + testq $1,%r8 + movq %r10,%rbp + movq %r11,%r14 + cmovnzq %r10,%rax + cmovnzq %r11,%rbx + subq %r8,%rbp + sbbq %r9,%r14 + movq %r8,%r15 + movq %r9,%rsi + subq %rax,%r8 + sbbq %rbx,%r9 + cmovcq %rbp,%r8 + cmovcq %r14,%r9 + cmovcq %r15,%r10 + cmovcq %rsi,%r11 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrdq $1,%r9,%r8 + shrq $1,%r9 + testq $1,%r15 + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz .Loop_62 + + movq 8(%rsp),%rsi + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_62,.-__inner_loop_62 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/ctx_inverse_mod_384-x86_64.s b/crypto/blst_src/build/elf/ctx_inverse_mod_384-x86_64.s new file mode 100644 index 00000000000..25a5fa5345f --- /dev/null +++ 
b/crypto/blst_src/build/elf/ctx_inverse_mod_384-x86_64.s @@ -0,0 +1,1574 @@ +.text + +.globl ctx_inverse_mod_383 +.type ctx_inverse_mod_383,@function +.align 32 +ctx_inverse_mod_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1112,%rsp +.cfi_adjust_cfa_offset 1112 + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx 
+ movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + 
call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq 
%rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 
96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + + xorq $256+96,%rsi + movl $53,%edi + + movq 0(%rsi),%r8 + + movq 48(%rsi),%r10 + + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulx_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1112-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size ctx_inverse_mod_383,.-ctx_inverse_mod_383 +.type __smulx_767x63,@function +.align 32 +__smulx_767x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + movq %rcx,%rax + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + sarq $63,%rax + xorq %rsi,%rsi + subq %rax,%rsi + + xorq %rax,%rdx + addq %rsi,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %rax,%r13 + xorq %rax,%r14 + xorq %rax,%r15 + xorq %rax,%rbx + xorq %rax,%rbp + xorq %rax,%rcx + xorq %rax,%rdi + addq %rsi,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulxq %r8,%r8,%rax + mulxq %r9,%r9,%rsi + addq %rax,%r9 + mulxq %r10,%r10,%rax + adcq %rsi,%r10 + mulxq %r11,%r11,%rsi + adcq %rax,%r11 + mulxq %r12,%r12,%rax + adcq %rsi,%r12 + mulxq %r13,%r13,%rsi + adcq %rax,%r13 + mulxq %r14,%r14,%rax + adcq %rsi,%r14 + mulxq %r15,%r15,%rsi + adcq %rax,%r15 + mulxq %rbx,%rbx,%rax + adcq %rsi,%rbx + mulxq %rbp,%rbp,%rsi + adcq %rax,%rbp + mulxq %rcx,%rcx,%rax + adcq %rsi,%rcx + mulxq %rdi,%rdi,%rsi + movq 8(%rsp),%rdx + movq 16(%rsp),%rsi + adcq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 
8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulx_767x63,.-__smulx_767x63 +.type __smulx_383x63,@function +.align 32 +__smulx_383x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + movq %rcx,%rdx + adcq %rbp,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + adcq %rbp,%r13 + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulx_383x63,.-__smulx_383x63 +.type __smulx_383_n_shift_by_31,@function +.align 32 +__smulx_383_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + xorq %r14,%r14 + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq %rdx,%r14 + + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 
48+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%rax + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%r12,%r11 + shrdq $31,%rax,%r12 + shrdq $31,%r14,%rax + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulx_383_n_shift_by_31,.-__smulx_383_n_shift_by_31 +.type __smulx_191_n_shift_by_31,@function +.align 32 +__smulx_191_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %r10,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r10 + addq %rbp,%r9 + adcq $0,%r10 + imulq %rdx + addq %rax,%r10 + adcq $0,%rdx + movq %rdx,%r14 + movq %rcx,%rdx + movq 48+0(%rsi),%r11 + movq 48+8(%rsi),%r12 + movq 48+16(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r11,%r11,%rbp + mulxq %r12,%r12,%r13 + addq %rbp,%r12 + adcq $0,%r13 + imulq %rdx + addq %rax,%r13 + adcq $0,%rdx + addq %r8,%r11 + adcq %r9,%r12 + adcq %r10,%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r12,%r11 + shrdq $31,%r13,%r12 + shrdq $31,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulx_191_n_shift_by_31,.-__smulx_191_n_shift_by_31 +.type __ab_approximation_31,@function +.align 32 +__ab_approximation_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 16(%rsi),%r8 + cmovzq %r10,%rbp + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 8(%rsi),%r8 + cmovzq %r10,%rbp + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 
48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + andnq %r9,%rax,%r9 + andnq %r11,%rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp __inner_loop_31 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ab_approximation_31,.-__ab_approximation_31 +.type __inner_loop_31,@function +.align 32 +__inner_loop_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +.Loop_31: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edi + jnz .Loop_31 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_31,.-__inner_loop_31 + +.type __inner_loop_62,@function +.align 32 +__inner_loop_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + +.Loop_62: + xorq %rax,%rax + testq $1,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq $1,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz .Loop_62 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_62,.-__inner_loop_62 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/div3w-armv8.S b/crypto/blst_src/build/elf/div3w-armv8.S new file mode 100644 index 00000000000..a2b1d676a36 --- /dev/null +++ b/crypto/blst_src/build/elf/div3w-armv8.S @@ -0,0 +1,88 @@ +.text + +.globl div_3_limbs +.type div_3_limbs,%function +.align 5 +div_3_limbs: + ldp x4,x5,[x0] // load R + eor x0,x0,x0 // Q = 0 + mov x3,#64 // loop counter + nop + +.Loop: + subs x6,x4,x1 // R - D + add x0,x0,x0 // Q <<= 1 + sbcs x7,x5,x2 + add x0,x0,#1 // Q + speculative bit + csel x4,x4,x6,lo // select between R and R - D + extr x1,x2,x1,#1 // D >>= 1 + csel x5,x5,x7,lo + lsr x2,x2,#1 + sbc x0,x0,xzr // subtract speculative bit + sub x3,x3,#1 + cbnz x3,.Loop + + asr x3,x0,#63 // top bit -> mask + add x0,x0,x0 // Q <<= 1 + subs x6,x4,x1 // R - D + add x0,x0,#1 // Q + specilative bit + sbcs x7,x5,x2 + sbc x0,x0,xzr // subtract speculative bit + + orr x0,x0,x3 // all ones if overflow + + ret +.size div_3_limbs,.-div_3_limbs +.globl quot_rem_128 +.type quot_rem_128,%function +.align 5 +quot_rem_128: + ldp x3,x4,[x1] + + mul x5,x3,x2 // divisor[0:1} * quotient + umulh x6,x3,x2 + mul x11, x4,x2 + umulh x7,x4,x2 + + ldp x8,x9,[x0] // load 3 limbs of the dividend + ldr x10,[x0,#16] + + adds x6,x6,x11 + adc x7,x7,xzr + + subs x8,x8,x5 // 
dividend - divisor * quotient + sbcs x9,x9,x6 + sbcs x10,x10,x7 + sbc x5,xzr,xzr // borrow -> mask + + add x2,x2,x5 // if borrowed, adjust the quotient ... + and x3,x3,x5 + and x4,x4,x5 + adds x8,x8,x3 // ... and add divisor + adc x9,x9,x4 + + stp x8,x9,[x0] // save 2 limbs of the remainder + str x2,[x0,#16] // and one limb of the quotient + + mov x0,x2 // return adjusted quotient + + ret +.size quot_rem_128,.-quot_rem_128 + +.globl quot_rem_64 +.type quot_rem_64,%function +.align 5 +quot_rem_64: + ldr x3,[x1] + ldr x8,[x0] // load 1 limb of the dividend + + mul x5,x3,x2 // divisor * quotient + + sub x8,x8,x5 // dividend - divisor * quotient + + stp x8,x2,[x0] // save remainder and quotient + + mov x0,x2 // return quotient + + ret +.size quot_rem_64,.-quot_rem_64 diff --git a/crypto/blst_src/build/elf/div3w-x86_64.s b/crypto/blst_src/build/elf/div3w-x86_64.s new file mode 100644 index 00000000000..00ae5699824 --- /dev/null +++ b/crypto/blst_src/build/elf/div3w-x86_64.s @@ -0,0 +1,123 @@ +.text + +.globl div_3_limbs +.hidden div_3_limbs +.type div_3_limbs,@function +.align 32 +div_3_limbs: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq (%rdi),%r8 + movq 8(%rdi),%r9 + xorq %rax,%rax + movl $64,%ecx + +.Loop: + movq %r8,%r10 + subq %rsi,%r8 + movq %r9,%r11 + sbbq %rdx,%r9 + leaq 1(%rax,%rax,1),%rax + movq %rdx,%rdi + cmovcq %r10,%r8 + cmovcq %r11,%r9 + sbbq $0,%rax + shlq $63,%rdi + shrq $1,%rsi + shrq $1,%rdx + orq %rdi,%rsi + subl $1,%ecx + jnz .Loop + + leaq 1(%rax,%rax,1),%rcx + sarq $63,%rax + + subq %rsi,%r8 + sbbq %rdx,%r9 + sbbq $0,%rcx + + orq %rcx,%rax + + .byte 0xf3,0xc3 +.cfi_endproc +.size div_3_limbs,.-div_3_limbs +.globl quot_rem_128 +.hidden quot_rem_128 +.type quot_rem_128,@function +.align 32 +quot_rem_128: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rax + movq %rdx,%rcx + + mulq 0(%rsi) + movq %rax,%r8 + movq %rcx,%rax + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r9 + adcq $0,%rdx + + movq 0(%rdi),%r10 + movq 8(%rdi),%r11 + movq 16(%rdi),%rax + + subq %r8,%r10 + sbbq %r9,%r11 + sbbq %rdx,%rax + sbbq %r8,%r8 + + addq %r8,%rcx + movq %r8,%r9 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + addq %r8,%r10 + adcq %r9,%r11 + + movq %r10,0(%rdi) + movq %r11,8(%rdi) + movq %rcx,16(%rdi) + + movq %rcx,%rax + + .byte 0xf3,0xc3 +.cfi_endproc +.size quot_rem_128,.-quot_rem_128 + + + + + +.globl quot_rem_64 +.hidden quot_rem_64 +.type quot_rem_64,@function +.align 32 +quot_rem_64: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rax + imulq 0(%rsi),%rdx + + movq 0(%rdi),%r10 + + subq %rdx,%r10 + + movq %r10,0(%rdi) + movq %rax,8(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size quot_rem_64,.-quot_rem_64 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/mul_mont_256-armv8.S b/crypto/blst_src/build/elf/mul_mont_256-armv8.S new file mode 100644 index 00000000000..8bb1197f464 --- /dev/null +++ b/crypto/blst_src/build/elf/mul_mont_256-armv8.S @@ -0,0 +1,464 @@ +.text + +.globl mul_mont_sparse_256 +.hidden mul_mont_sparse_256 +.type mul_mont_sparse_256,%function +.align 5 +mul_mont_sparse_256: + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x10,x11,[x1] + ldr x9, [x2] + ldp x12,x13,[x1,#16] + + mul x19,x10,x9 + ldp x5,x6,[x3] + mul x20,x11,x9 + ldp x7,x8,[x3,#16] + mul x21,x12,x9 + mul x22,x13,x9 + + umulh x14,x10,x9 + umulh x15,x11,x9 + mul x3,x4,x19 + umulh x16,x12,x9 + umulh x17,x13,x9 + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,xzr, x17 + mul x17,x8,x3 + ldr x9,[x2,8*1] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*2] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*3] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + adcs x20,x21,x15 + adcs x21,x22,x16 + adcs x22,x23,x17 + adc x23,xzr,xzr + + subs x14,x19,x5 + sbcs x15,x20,x6 + sbcs x16,x21,x7 + sbcs x17,x22,x8 + sbcs xzr, x23,xzr + + csel x19,x19,x14,lo + csel x20,x20,x15,lo + csel x21,x21,x16,lo + csel x22,x22,x17,lo + + stp x19,x20,[x0] + stp x21,x22,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret +.size mul_mont_sparse_256,.-mul_mont_sparse_256 +.globl sqr_mont_sparse_256 +.hidden sqr_mont_sparse_256 +.type sqr_mont_sparse_256,%function +.align 5 +sqr_mont_sparse_256: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + mov x4,x3 + + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x11,x6,x5 // a[1]*a[0] + umulh x15,x6,x5 + mul x12,x7,x5 // a[2]*a[0] + umulh x16,x7,x5 + mul x13,x8,x5 // a[3]*a[0] + umulh x19,x8,x5 + + adds x12,x12,x15 // accumulate high parts of multiplication + mul x14,x7,x6 // a[2]*a[1] + umulh x15,x7,x6 + adcs x13,x13,x16 + mul x16,x8,x6 // a[3]*a[1] + umulh x17,x8,x6 + adc x19,x19,xzr // can't overflow + + mul x20,x8,x7 // a[3]*a[2] + umulh x21,x8,x7 + + adds x15,x15,x16 // accumulate high parts of multiplication + mul x10,x5,x5 // a[0]*a[0] + adc x16,x17,xzr // can't overflow + + adds x13,x13,x14 // accumulate low parts of multiplication + umulh x5,x5,x5 + adcs x19,x19,x15 + mul x15,x6,x6 // a[1]*a[1] + adcs x20,x20,x16 + umulh x6,x6,x6 + adc x21,x21,xzr // can't overflow + + adds x11,x11,x11 // acc[1-6]*=2 + mul x16,x7,x7 // a[2]*a[2] + adcs x12,x12,x12 + umulh x7,x7,x7 + adcs x13,x13,x13 + mul x17,x8,x8 // a[3]*a[3] + adcs x19,x19,x19 + umulh x8,x8,x8 + adcs x20,x20,x20 + adcs x21,x21,x21 + adc x22,xzr,xzr + + adds x11,x11,x5 // +a[i]*a[i] + adcs x12,x12,x15 + adcs x13,x13,x6 + adcs x19,x19,x16 + adcs x20,x20,x7 + adcs x21,x21,x17 + adc x22,x22,x8 + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + adds x10,x10,x19 // accumulate upper half + adcs x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adc x19,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x19,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 +.globl from_mont_256 +.hidden from_mont_256 +.type from_mont_256,%function +.align 5 +from_mont_256: + .inst 0xd503233f + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 + .inst 0xd50323bf + ret +.size from_mont_256,.-from_mont_256 + +.globl redc_mont_256 +.hidden redc_mont_256 +.type redc_mont_256,%function +.align 5 +redc_mont_256: + .inst 0xd503233f + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + ldp x14,x15,[x1,#32] + ldp x16,x17,[x1,#48] + + adds x10,x10,x14 + adcs x11,x11,x15 + adcs x12,x12,x16 + adcs x13,x13,x17 + adc x9,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x9,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 + .inst 0xd50323bf + ret +.size redc_mont_256,.-redc_mont_256 + +.type __mul_by_1_mont_256,%function +.align 5 +__mul_by_1_mont_256: + mul x3,x4,x10 + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + adc x13,x9,x17 + + ret +.size __mul_by_1_mont_256,.-__mul_by_1_mont_256 diff --git a/crypto/blst_src/build/elf/mul_mont_384-armv8.S b/crypto/blst_src/build/elf/mul_mont_384-armv8.S new file mode 100644 index 00000000000..c048e816b85 --- /dev/null +++ b/crypto/blst_src/build/elf/mul_mont_384-armv8.S @@ -0,0 +1,2372 @@ +.text + +.globl add_mod_384x384 +.type add_mod_384x384,%function +.align 5 +add_mod_384x384: + .inst 0xd503233f + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __add_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + .inst 0xd50323bf + ret +.size add_mod_384x384,.-add_mod_384x384 + +.type __add_mod_384x384,%function +.align 5 +__add_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + stp x11, x12, [x0] + adcs x15,x15,x23 + ldp x11, x12, [x1,#48] + adcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + adcs x11,x11,x19 + stp x15, x16, [x0,#32] + adcs x12,x12,x20 + ldp x15, x16, [x1,#80] + adcs x13,x13,x21 + ldp x23,x24,[x2,#80] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + stp x11,x12,[x0,#48] + csel x15,x15,x23,lo + stp x13,x14,[x0,#64] + csel x16,x16,x24,lo + stp x15,x16,[x0,#80] + + ret +.size __add_mod_384x384,.-__add_mod_384x384 + +.globl sub_mod_384x384 +.type sub_mod_384x384,%function +.align 5 +sub_mod_384x384: + .inst 0xd503233f + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + .inst 0xd50323bf + ret +.size sub_mod_384x384,.-sub_mod_384x384 + +.type __sub_mod_384x384,%function +.align 5 +__sub_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + stp x11, x12, [x0] + sbcs x15,x15,x23 + ldp x11, x12, [x1,#48] + sbcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + sbcs x11,x11,x19 + stp x15, x16, [x0,#32] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#80] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#80] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + ret +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,%function +.align 5 +__add_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + stp x11,x12,[x0] + csel x16,x16,x24,lo + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + 
ret +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,%function +.align 5 +__sub_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0] + adc x16,x16,x24 + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret +.size __sub_mod_384,.-__sub_mod_384 + +.globl mul_mont_384x +.hidden mul_mont_384x +.type mul_mont_384x,%function +.align 5 +mul_mont_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#288 // space for 3 768-bit vectors + + mov x26,x0 // save r_ptr + mov x27,x1 // save b_ptr + mov x28,x2 // save b_ptr + + sub x0,sp,#0 // mul_384(t0, a->re, b->re) + bl __mul_384 + + add x1,x1,#48 // mul_384(t1, a->im, b->im) + add x2,x2,#48 + add x0,sp,#96 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + sub x2,x1,#48 + add x0,sp,#240 + bl __add_mod_384 + + add x1,x28,#0 + add x2,x28,#48 + add x0,sp,#192 // t2 + bl __add_mod_384 + + add x1,x0,#0 + add x2,x0,#48 + bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,x0 + add x2,sp,#0 + bl __sub_mod_384x384 + + add x2,sp,#96 + bl __sub_mod_384x384 // t2 = t2-t0-t1 + + add x1,sp,#0 + add x2,sp,#96 + add x0,sp,#0 + bl __sub_mod_384x384 // t0 = t0-t1 + + add x1,sp,#0 // ret->re = redc(t0) + add x0,x26,#0 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + + add x1,sp,#192 // ret->im = redc(t2) + add x0,x0,#48 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#288 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size mul_mont_384x,.-mul_mont_384x + +.globl sqr_mont_384x +.hidden sqr_mont_384x +.type sqr_mont_384x,%function +.align 5 +sqr_mont_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 2 384-bit vectors + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + add x0,sp,#0 + bl __add_mod_384 // t0 = a->re + a->im + + add x0,sp,#48 + bl __sub_mod_384 // t1 = a->re - a->im + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) + + adds x11,x11,x11 // add with itself + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x19,x11,x19,lo + csel x20,x12,x20,lo + csel x21,x13,x21,lo + ldp x11,x12,[sp] + csel x22,x14,x22,lo + ldr x17, [sp,#48] + csel x23,x15,x23,lo + ldp x13,x14,[sp,#16] + csel x24,x16,x24,lo + ldp x15,x16,[sp,#32] + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + add x2,sp,#48 + bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_mont_384x,.-sqr_mont_384x + +.globl mul_mont_384 +.hidden mul_mont_384 +.type mul_mont_384,%function +.align 5 +mul_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size mul_mont_384,.-mul_mont_384 + +.type __mul_mont_384,%function +.align 5 +__mul_mont_384: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + mov x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*1] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh 
x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*2] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*3] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*4] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc 
x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*5] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + adc x17,x17,xzr + + adds x19,x20,x26 + adcs x20,x21,x27 + adcs x21,x22,x28 + adcs x22,x23,x0 + adcs x23,x24,x1 + adcs x24,x25,x3 + adc x25,x17,xzr + + subs x26,x19,x5 + sbcs x27,x20,x6 + sbcs x28,x21,x7 + sbcs x0,x22,x8 + sbcs x1,x23,x9 + sbcs x3,x24,x10 + sbcs xzr, x25,xzr + + csel x11,x19,x26,lo + csel x12,x20,x27,lo + csel x13,x21,x28,lo + csel x14,x22,x0,lo + csel x15,x23,x1,lo + csel x16,x24,x3,lo + ret +.size __mul_mont_384,.-__mul_mont_384 + +.globl sqr_mont_384 +.hidden sqr_mont_384 +.type sqr_mont_384,%function +.align 5 +sqr_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for 768-bit vector + mov x4,x3 // adjust for missing b_ptr + + mov x3,x0 // save r_ptr + mov x0,sp + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + mov x1,sp + mov x0,x3 // restore r_ptr + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_mont_384,.-sqr_mont_384 + +.globl sqr_n_mul_mont_383 +.hidden sqr_n_mul_mont_383 +.type sqr_n_mul_mont_383,%function +.align 5 +sqr_n_mul_mont_383: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 768-bit vector + mov x17,x5 // save b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + mov x0,sp +.Loop_sqr_383: + bl __sqr_384 + sub x2,x2,#1 // counter + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,sp + bl __mul_by_1_mont_384 + + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // just accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + cbnz x2,.Loop_sqr_383 + + mov x2,x17 + ldr x17,[x17] + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 +.type __sqr_384,%function +.align 5 +__sqr_384: + mul x19,x12,x11 + mul x20,x13,x11 + mul x21,x14,x11 + mul x22,x15,x11 + mul x23,x16,x11 + + umulh x6,x12,x11 + umulh x7,x13,x11 + umulh x8,x14,x11 + umulh x9,x15,x11 + adds x20,x20,x6 + umulh x10,x16,x11 + adcs x21,x21,x7 + mul x7,x13,x12 + adcs x22,x22,x8 + mul x8,x14,x12 + adcs x23,x23,x9 + mul x9,x15,x12 + adc x24,xzr, x10 + mul x10,x16,x12 + + adds x21,x21,x7 + umulh x7,x13,x12 + adcs x22,x22,x8 + umulh x8,x14,x12 + adcs x23,x23,x9 + umulh x9,x15,x12 + adcs x24,x24,x10 + umulh x10,x16,x12 + adc x25,xzr,xzr + + mul x5,x11,x11 + adds x22,x22,x7 + umulh x11, x11,x11 + adcs x23,x23,x8 + mul x8,x14,x13 + adcs x24,x24,x9 + mul x9,x15,x13 + adc x25,x25,x10 + mul x10,x16,x13 + + adds x23,x23,x8 + umulh x8,x14,x13 + adcs x24,x24,x9 + umulh x9,x15,x13 + adcs x25,x25,x10 + umulh x10,x16,x13 + adc x26,xzr,xzr + + mul x6,x12,x12 + adds x24,x24,x8 + umulh x12, x12,x12 + adcs x25,x25,x9 + mul x9,x15,x14 + adc x26,x26,x10 + mul x10,x16,x14 + + adds x25,x25,x9 + umulh x9,x15,x14 + adcs x26,x26,x10 + umulh x10,x16,x14 + adc x27,xzr,xzr + mul x7,x13,x13 + adds x26,x26,x9 + umulh x13, x13,x13 + adc x27,x27,x10 + mul x8,x14,x14 + + mul x10,x16,x15 + umulh x14, x14,x14 + adds x27,x27,x10 + umulh x10,x16,x15 + mul x9,x15,x15 + adc x28,x10,xzr + + adds x19,x19,x19 + adcs x20,x20,x20 + adcs x21,x21,x21 + adcs x22,x22,x22 + adcs x23,x23,x23 + adcs x24,x24,x24 + adcs x25,x25,x25 + adcs x26,x26,x26 + umulh x15, x15,x15 + adcs x27,x27,x27 + mul x10,x16,x16 + adcs x28,x28,x28 + umulh x16, x16,x16 + adc x1,xzr,xzr + + adds x19,x19,x11 + adcs x20,x20,x6 + adcs x21,x21,x12 + adcs x22,x22,x7 + adcs x23,x23,x13 + adcs x24,x24,x8 + adcs x25,x25,x14 + stp x5,x19,[x0] + adcs x26,x26,x9 + stp x20,x21,[x0,#16] + adcs x27,x27,x15 + stp x22,x23,[x0,#32] + adcs x28,x28,x10 + stp x24,x25,[x0,#48] + adc x16,x16,x1 + stp x26,x27,[x0,#64] + stp x28,x16,[x0,#80] + + ret +.size __sqr_384,.-__sqr_384 +.globl sqr_384 +.hidden sqr_384 +.type sqr_384,%function +.align 5 +sqr_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_384,.-sqr_384 + +.globl redc_mont_384 +.hidden redc_mont_384 +.type redc_mont_384,%function +.align 5 +redc_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size redc_mont_384,.-redc_mont_384 + +.globl from_mont_384 +.hidden from_mont_384 +.type from_mont_384,%function +.align 5 +from_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size from_mont_384,.-from_mont_384 + +.type __mul_by_1_mont_384,%function +.align 5 +__mul_by_1_mont_384: + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + mul x26,x4,x11 + ldp x15,x16,[x1,#32] + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc 
x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + ret +.size __mul_by_1_mont_384,.-__mul_by_1_mont_384 + +.type __redc_tail_mont_384,%function +.align 5 +__redc_tail_mont_384: + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl mul_384 +.hidden mul_384 +.type mul_384,%function +.align 5 +mul_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + bl __mul_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size mul_384,.-mul_384 + +.type __mul_384,%function +.align 5 +__mul_384: + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + + umulh x5,x11,x17 + umulh x6,x12,x17 + umulh x7,x13,x17 + umulh x8,x14,x17 + umulh x9,x15,x17 + umulh x10,x16,x17 + ldr x17,[x2,8*1] + + str x19,[x0] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,xzr, x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(1+1)] + adc x25,xzr,xzr + + str x19,[x0,8*1] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(2+1)] + adc x25,xzr,xzr + + str x19,[x0,8*2] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(3+1)] + adc x25,xzr,xzr + + str x19,[x0,8*3] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(4+1)] + adc x25,xzr,xzr + + str x19,[x0,8*4] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + adc x25,xzr,xzr + + str x19,[x0,8*5] + adds x19,x20,x5 + adcs x20,x21,x6 + adcs x21,x22,x7 + adcs x22,x23,x8 + adcs x23,x24,x9 + adc x24,x25,x10 + + stp x19,x20,[x0,#48] + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ret +.size __mul_384,.-__mul_384 + +.globl mul_382x +.hidden mul_382x +.type mul_382x,%function +.align 5 +mul_382x: + .inst 0xd503233f + stp 
x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for two 384-bit vectors + + ldp x11,x12,[x1] + mov x26,x0 // save r_ptr + ldp x19,x20,[x1,#48] + mov x27,x1 // save a_ptr + ldp x13,x14,[x1,#16] + mov x28,x2 // save b_ptr + ldp x21,x22,[x1,#64] + ldp x15,x16,[x1,#32] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x23,x24,[x1,#80] + adcs x6,x12,x20 + ldp x11,x12,[x2] + adcs x7,x13,x21 + ldp x19,x20,[x2,#48] + adcs x8,x14,x22 + ldp x13,x14,[x2,#16] + adcs x9,x15,x23 + ldp x21,x22,[x2,#64] + adc x10,x16,x24 + ldp x15,x16,[x2,#32] + + stp x5,x6,[sp] + adds x5,x11,x19 // t1 = b->re + b->im + ldp x23,x24,[x2,#80] + adcs x6,x12,x20 + stp x7,x8,[sp,#16] + adcs x7,x13,x21 + adcs x8,x14,x22 + stp x9,x10,[sp,#32] + adcs x9,x15,x23 + stp x5,x6,[sp,#48] + adc x10,x16,x24 + stp x7,x8,[sp,#64] + stp x9,x10,[sp,#80] + + bl __mul_384 // mul_384(ret->re, a->re, b->re) + + add x1,sp,#0 // mul_384(ret->im, t0, t1) + add x2,sp,#48 + add x0,x26,#96 + bl __mul_384 + + add x1,x27,#48 // mul_384(tx, a->im, b->im) + add x2,x28,#48 + add x0,sp,#0 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + add x1,x26,#96 // ret->im -= tx + add x2,sp,#0 + add x0,x26,#96 + bl __sub_mod_384x384 + + add x2,x26,#0 // ret->im -= ret->re + bl __sub_mod_384x384 + + add x1,x26,#0 // ret->re -= tx + add x2,sp,#0 + add x0,x26,#0 + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size mul_382x,.-mul_382x + +.globl sqr_382x +.hidden sqr_382x +.type sqr_382x,%function +.align 5 +sqr_382x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x19,x20,[x1,#48] + ldp x13,x14,[x1,#16] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x21,x22,[x1,#64] + adcs x6,x12,x20 + ldp x15,x16,[x1,#32] + adcs x7,x13,x21 + ldp x23,x24,[x1,#80] + adcs x8,x14,x22 + stp x5,x6,[x0] + adcs x9,x15,x23 + ldp x5,x6,[x2] + adc x10,x16,x24 + stp x7,x8,[x0,#16] + + subs x11,x11,x19 // t1 = a->re - a->im + ldp x7,x8,[x2,#16] + sbcs x12,x12,x20 + stp x9,x10,[x0,#32] + sbcs x13,x13,x21 + ldp x9,x10,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + adds x11,x11,x19 + and x21,x7,x25 + adcs x12,x12,x20 + and x22,x8,x25 + adcs x13,x13,x21 + and x23,x9,x25 + adcs x14,x14,x22 + and x24,x10,x25 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + mov x4,x1 // save a_ptr + add x1,x0,#0 // mul_384(ret->re, t0, t1) + add x2,x0,#48 + bl __mul_384 + + add x1,x4,#0 // mul_384(ret->im, a->re, a->im) + add x2,x4,#48 + add x0,x0,#96 + bl __mul_384 + ldr x30,[x29,#8] + + ldp x11,x12,[x0] + ldp x13,x14,[x0,#16] + adds x11,x11,x11 // add with itself + ldp x15,x16,[x0,#32] + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adcs x19,x19,x19 + adcs x20,x20,x20 + stp x11,x12,[x0] + adcs x21,x21,x21 + stp x13,x14,[x0,#16] + adcs x22,x22,x22 + stp x15,x16,[x0,#32] + adcs x23,x23,x23 + stp x19,x20,[x0,#48] + adc x24,x24,x24 + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_382x,.-sqr_382x + +.globl sqr_mont_382x +.hidden sqr_mont_382x +.type sqr_mont_382x,%function +.align 5 +sqr_mont_382x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#112 // space for two 384-bit vectors + word + mov x4,x3 // adjust for missing b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x17,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x5,x11,x17 // t0 = a->re + a->im + adcs x6,x12,x20 + adcs x7,x13,x21 + adcs x8,x14,x22 + adcs x9,x15,x23 + adc x10,x16,x24 + + subs x19,x11,x17 // t1 = a->re - a->im + sbcs x20,x12,x20 + sbcs x21,x13,x21 + sbcs x22,x14,x22 + sbcs x23,x15,x23 + sbcs x24,x16,x24 + sbc x25,xzr,xzr // borrow flag as mask + + stp x5,x6,[sp] + stp x7,x8,[sp,#16] + stp x9,x10,[sp,#32] + stp x19,x20,[sp,#48] + stp x21,x22,[sp,#64] + stp x23,x24,[sp,#80] + str x25,[sp,#96] + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) + + adds x19,x11,x11 // add with itself + adcs x20,x12,x12 + adcs x21,x13,x13 + adcs x22,x14,x14 + adcs x23,x15,x15 + adc x24,x16,x16 + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + ldp x11,x12,[sp] + ldr x17,[sp,#48] + ldp x13,x14,[sp,#16] + ldp x15,x16,[sp,#32] + + add x2,sp,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) + ldr x30,[x29,#8] + + ldr x25,[sp,#96] // account for sign from a->re - a->im + ldp x19,x20,[sp] + ldp x21,x22,[sp,#16] + ldp x23,x24,[sp,#32] + + and x19,x19,x25 + and x20,x20,x25 + and x21,x21,x25 + and x22,x22,x25 + and x23,x23,x25 + and x24,x24,x25 + + subs x11,x11,x19 + sbcs x12,x12,x20 + sbcs x13,x13,x21 + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + and x21,x7,x25 + and x22,x8,x25 + and x23,x9,x25 + and x24,x10,x25 + + adds x11,x11,x19 + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#112 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_mont_382x,.-sqr_mont_382x + +.type __mul_mont_383_nonred,%function +.align 5 +__mul_mont_383_nonred: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + ldr x17,[x2,8*1] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 
+ umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*2] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*3] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*4] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*5] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul 
x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + + adds x11,x20,x26 + adcs x12,x21,x27 + adcs x13,x22,x28 + adcs x14,x23,x0 + adcs x15,x24,x1 + adcs x16,x25,x3 + + ret +.size __mul_mont_383_nonred,.-__mul_mont_383_nonred + +.globl sgn0_pty_mont_384 +.hidden sgn0_pty_mont_384 +.type sgn0_pty_mont_384,%function +.align 5 +sgn0_pty_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + adds x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 + +.globl sgn0_pty_mont_384x +.hidden sgn0_pty_mont_384x +.type sgn0_pty_mont_384x,%function +.align 5 +sgn0_pty_mont_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + add x1,x1,#48 + + and x2,x11,#1 + orr x3,x11,x12 + adds x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + orr x3,x3,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x2,x2,x17 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + orr x1,x11,x12 + adds x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + orr x1,x1,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x diff --git a/crypto/blst_src/build/elf/mulq_mont_256-x86_64.s b/crypto/blst_src/build/elf/mulq_mont_256-x86_64.s new file mode 100644 index 00000000000..37abd4392d3 --- /dev/null +++ b/crypto/blst_src/build/elf/mulq_mont_256-x86_64.s @@ -0,0 +1,714 @@ +.text + +.globl mul_mont_sparse_256 +.hidden mul_mont_sparse_256 +.type mul_mont_sparse_256,@function +.align 32 +mul_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rdx),%rax + movq 0(%rsi),%r13 + movq 8(%rsi),%r14 + movq 16(%rsi),%r12 + movq 24(%rsi),%rbp + movq %rdx,%rbx + + movq %rax,%r15 + mulq %r13 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_mont_sparse_256,.-mul_mont_sparse_256 + +.globl sqr_mont_sparse_256 +.hidden sqr_mont_sparse_256 +.type sqr_mont_sparse_256,@function +.align 32 +sqr_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rax + movq %rcx,%r8 + movq 8(%rsi),%r14 + movq %rdx,%rcx + movq 16(%rsi),%r12 + leaq (%rsi),%rbx + movq 24(%rsi),%rbp + + movq %rax,%r15 + mulq %rax + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 +.type __mulq_mont_sparse_256,@function +.align 32 +__mulq_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + mulq %r14 + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq 8(%rbx),%rax + adcq $0,%rdx + xorq %r14,%r14 + movq %rdx,%r13 + + movq %r9,%rdi + imulq %r8,%r9 + + + movq %rax,%r15 + mulq 0(%rsi) + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + addq 
%rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + xorq %r15,%r15 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r9,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rdi,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + addq %rdx,%r13 + adcq $0,%r14 + adcq $0,%r15 + movq %r10,%rdi + imulq %r8,%r10 + + + movq %rax,%r9 + mulq 0(%rsi) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + xorq %r9,%r9 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r10,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rdi,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r13 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + addq %rdx,%r14 + adcq $0,%r15 + adcq $0,%r9 + movq %r11,%rdi + imulq %r8,%r11 + + + movq %rax,%r10 + mulq 0(%rsi) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r9 + xorq %r10,%r10 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r11,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rdi,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + addq %rdx,%r15 + adcq $0,%r9 + adcq $0,%r10 + imulq %r8,%rax + movq 8(%rsp),%rsi + + + movq %rax,%r11 + mulq 0(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r12,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + movq %r14,%rbx + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rdx,%r9 + adcq $0,%r10 + + + + + movq %r15,%r12 + subq 0(%rcx),%r13 + sbbq 8(%rcx),%r14 + sbbq 16(%rcx),%r15 + movq %r9,%rbp + sbbq 24(%rcx),%r9 + sbbq $0,%r10 + + cmovcq %rax,%r13 + cmovcq %rbx,%r14 + cmovcq %r12,%r15 + movq %r13,0(%rsi) + cmovcq %rbp,%r9 + movq %r14,8(%rsi) + movq %r15,16(%rsi) + movq %r9,24(%rsi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_mont_sparse_256,.-__mulq_mont_sparse_256 +.globl from_mont_256 +.hidden from_mont_256 +.type from_mont_256,@function +.align 32 
+from_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + + + + + movq %r14,%r10 + movq %r15,%r11 + movq %r9,%r12 + + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + sbbq 24(%rbx),%r9 + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size from_mont_256,.-from_mont_256 + +.globl redc_mont_256 +.hidden redc_mont_256 +.type redc_mont_256,@function +.align 32 +redc_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + addq 32(%rsi),%r13 + adcq 40(%rsi),%r14 + movq %r13,%rax + adcq 48(%rsi),%r15 + movq %r14,%r10 + adcq 56(%rsi),%r9 + sbbq %rsi,%rsi + + + + + movq %r15,%r11 + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + movq %r9,%r12 + sbbq 24(%rbx),%r9 + sbbq $0,%rsi + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redc_mont_256,.-redc_mont_256 +.type __mulq_by_1_mont_256,@function +.align 32 +__mulq_by_1_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + movq %rax,%r13 + imulq %rcx,%rax + movq %rax,%r9 + + mulq 0(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq %rdx,%r13 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r10 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 16(%rbx) + movq %r10,%r14 + imulq %rcx,%r10 + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r11 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r13,%r12 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq 
%r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r9 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_by_1_mont_256,.-__mulq_by_1_mont_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/mulq_mont_384-x86_64.s b/crypto/blst_src/build/elf/mulq_mont_384-x86_64.s new file mode 100644 index 00000000000..fa9dd3529ad --- /dev/null +++ b/crypto/blst_src/build/elf/mulq_mont_384-x86_64.s @@ -0,0 +1,3620 @@ +.text + + + + + + + +.type __sub_mod_384x384,@function +.align 32 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,@function +.align 32 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq 
%r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,@function +.align 32 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384,.-__sub_mod_384 +.globl mul_mont_384x +.hidden mul_mont_384x +.type mul_mont_384x,@function +.align 32 +mul_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $328,%rsp +.cfi_adjust_cfa_offset 328 + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulq_384 + + + leaq 48(%rbx),%rbx + leaq 48(%rsi),%rsi + leaq 40+96(%rsp),%rdi + call __mulq_384 + + + movq 8(%rsp),%rcx + leaq -48(%rsi),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulq_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + movq %rcx,%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -328-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_mont_384x,.-mul_mont_384x +.globl sqr_mont_384x +.hidden sqr_mont_384x +.type sqr_mont_384x,@function +.align 32 +sqr_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + call __mulq_mont_384 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + movq %r14,%r12 + adcq %r9,%r9 + movq %r15,%r13 + adcq %r10,%r10 + movq %r8,%rax + adcq %r11,%r11 + movq %r9,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r10,%rbp + sbbq 16(%rcx),%r8 + sbbq 24(%rcx),%r9 + sbbq 32(%rcx),%r10 + movq %r11,%rsi + sbbq 40(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r14 + cmovcq %r13,%r15 + cmovcq %rax,%r8 + movq %r14,48(%rdi) + cmovcq %rbx,%r9 + movq %r15,56(%rdi) + cmovcq %rbp,%r10 + movq %r8,64(%rdi) + cmovcq %rsi,%r11 + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_384x,.-sqr_mont_384x + +.globl mul_382x +.hidden mul_382x +.type mul_382x,@function +.align 32 +mul_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulq_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulq_384 + + + leaq 48(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulq_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 
-96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_382x,.-mul_382x +.globl sqr_382x +.hidden sqr_382x +.type sqr_382x,@function +.align 32 +sqr_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulq_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulq_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_382x,.-sqr_382x +.globl mul_384 +.hidden mul_384 +.type mul_384,@function +.align 32 +mul_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq %rdx,%rbx + call __mulq_384 + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_384,.-mul_384 + +.type __mulq_384,@function +.align 32 +__mulq_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rax + + movq %rax,%rbp + mulq 0(%rsi) + movq %rax,0(%rdi) + movq 
%rbp,%rax + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r11 + movq 8(%rbx),%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,8(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,16(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,24(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,32(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,40(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq 
%rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq %rax,%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rcx,48(%rdi) + movq %r8,56(%rdi) + movq %r9,64(%rdi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_384,.-__mulq_384 +.globl sqr_384 +.hidden sqr_384 +.type sqr_384,@function +.align 32 +sqr_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sqrq_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_384,.-sqr_384 + +.type __sqrq_384,@function +.align 32 +__sqrq_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r15 + movq 16(%rsi),%rcx + movq 24(%rsi),%rbx + + + movq %rax,%r14 + mulq %r15 + movq %rax,%r9 + movq %r14,%rax + movq 32(%rsi),%rbp + movq %rdx,%r10 + + mulq %rcx + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + movq 40(%rsi),%rsi + movq %rdx,%r11 + + mulq %rbx + addq %rax,%r11 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq %rax + xorq %r8,%r8 + movq %rax,0(%rdi) + movq %r15,%rax + addq %r9,%r9 + adcq $0,%r8 + addq %rdx,%r9 + adcq $0,%r8 + movq %r9,8(%rdi) + + mulq %rcx + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbx + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbp + addq %rax,%r13 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rax + xorq %r9,%r9 + addq %rax,%r8 + movq %rcx,%rax + addq %r10,%r10 + adcq %r11,%r11 + adcq $0,%r9 + addq %r8,%r10 + adcq %rdx,%r11 + adcq $0,%r9 + movq %r10,16(%rdi) + + mulq %rbx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + movq %r11,24(%rdi) + movq %rdx,%r8 + + mulq %rbp + addq %rax,%r14 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq %rsi + addq %rax,%r15 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + xorq %r11,%r11 + addq %rax,%r9 + movq %rbx,%rax + addq %r12,%r12 + adcq %r13,%r13 + adcq $0,%r11 + addq %r9,%r12 + adcq %rdx,%r13 + adcq $0,%r11 + movq %r12,32(%rdi) + + + mulq %rbp + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %r13,40(%rdi) + movq %rdx,%r8 + + mulq %rsi + addq %rax,%rcx + movq %rbx,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%rbx + + mulq %rax + xorq 
%r12,%r12 + addq %rax,%r11 + movq %rbp,%rax + addq %r14,%r14 + adcq %r15,%r15 + adcq $0,%r12 + addq %r11,%r14 + adcq %rdx,%r15 + movq %r14,48(%rdi) + adcq $0,%r12 + movq %r15,56(%rdi) + + + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + xorq %r13,%r13 + addq %rax,%r12 + movq %rsi,%rax + addq %rcx,%rcx + adcq %rbx,%rbx + adcq $0,%r13 + addq %r12,%rcx + adcq %rdx,%rbx + movq %rcx,64(%rdi) + adcq $0,%r13 + movq %rbx,72(%rdi) + + + mulq %rax + addq %r13,%rax + addq %rbp,%rbp + adcq $0,%rdx + addq %rbp,%rax + adcq $0,%rdx + movq %rax,80(%rdi) + movq %rdx,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sqrq_384,.-__sqrq_384 + +.globl sqr_mont_384 +.hidden sqr_mont_384 +.type sqr_mont_384,@function +.align 32 +sqr_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $120,%rsp +.cfi_adjust_cfa_offset 8*15 + + + movq %rcx,96(%rsp) + movq %rdx,104(%rsp) + movq %rdi,112(%rsp) + + movq %rsp,%rdi + call __sqrq_384 + + leaq 0(%rsp),%rsi + movq 96(%rsp),%rcx + movq 104(%rsp),%rbx + movq 112(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 120(%rsp),%r8 + movq 120(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*21 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_384,.-sqr_mont_384 + + + +.globl redc_mont_384 +.hidden redc_mont_384 +.type redc_mont_384,@function +.align 32 +redc_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redc_mont_384,.-redc_mont_384 + + + + +.globl from_mont_384 +.hidden from_mont_384 +.type from_mont_384,@function +.align 32 +from_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + + + + + + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 
0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size from_mont_384,.-from_mont_384 +.type __mulq_by_1_mont_384,@function +.align 32 +__mulq_by_1_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r8 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r9 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r10 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %r9,%r15 + imulq %rcx,%r9 + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 32(%rbx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 40(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r9,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %r10,%r8 + imulq %rcx,%r10 + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r8 + movq %r10,%rax + adcq %rdx,%r8 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %r11,%r9 + imulq %rcx,%r11 + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r11,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %r12,%r10 + imulq %rcx,%r12 + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rbx) + addq %rax,%r15 + movq %r11,%rax + 
adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %r13,%r11 + imulq %rcx,%r13 + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_by_1_mont_384,.-__mulq_by_1_mont_384 + +.type __redc_tail_mont_384,@function +.align 32 +__redc_tail_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl sgn0_pty_mont_384 +.hidden sgn0_pty_mont_384 +.type sgn0_pty_mont_384,@function +.align 32 +sgn0_pty_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 
32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 + +.globl sgn0_pty_mont_384x +.hidden sgn0_pty_mont_384x +.type sgn0_pty_mont_384x,@function +.align 32 +sgn0_pty_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x +.globl mul_mont_384 +.hidden mul_mont_384 +.type mul_mont_384,@function +.align 32 +mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq 0(%rdx),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + movq %rdx,%rbx + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + + call __mulq_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -72 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_mont_384,.-mul_mont_384 +.type __mulq_mont_384,@function +.align 32 +__mulq_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + 
+ movq %rax,%rdi + mulq %r14 + movq %rax,%r8 + movq %rdi,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%rbp + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + xorq %r15,%r15 + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r8,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + adcq $0,%r15 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r9,%rbp + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r14 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r14 + movq %r9,%rax + adcq %rdx,%r15 + adcq $0,%r8 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r9,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + adcq $0,%r8 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r10,%rbp + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r15 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r15 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r10,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + 
movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r8 + adcq $0,%r9 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r11,%rbp + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r11,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq %rdx,%r9 + adcq $0,%r10 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r12,%rbp + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r9 + adcq $0,%rdx + xorq %r11,%r11 + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r10 + adcq $0,%r11 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r12,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq %rdx,%r10 + adcq $0,%r11 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + movq %r13,%rbp + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq 
$0,%rdx + addq %r12,%r8 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rsi) + addq %r12,%r10 + adcq $0,%rdx + xorq %r12,%r12 + addq %rax,%r10 + movq %r13,%rax + adcq %rdx,%r11 + adcq $0,%r12 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r13,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq %rdx,%r11 + adcq $0,%r12 + + + + + movq 16(%rsp),%rdi + subq 0(%rcx),%r14 + movq %r15,%rdx + sbbq 8(%rcx),%r15 + movq %r8,%rbx + sbbq 16(%rcx),%r8 + movq %r9,%rsi + sbbq 24(%rcx),%r9 + movq %r10,%rbp + sbbq 32(%rcx),%r10 + movq %r11,%r13 + sbbq 40(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rdx,%r15 + cmovcq %rbx,%r8 + movq %r14,0(%rdi) + cmovcq %rsi,%r9 + movq %r15,8(%rdi) + cmovcq %rbp,%r10 + movq %r8,16(%rdi) + cmovcq %r13,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_mont_384,.-__mulq_mont_384 +.globl sqr_n_mul_mont_384 +.hidden sqr_n_mul_mont_384 +.type sqr_n_mul_mont_384,@function +.align 32 +sqr_n_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 8*17 + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +.Loop_sqr_384: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movd %xmm1,%edx + leaq 0(%rdi),%rsi + decl %edx + jnz .Loop_sqr_384 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*23 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_n_mul_mont_384,.-sqr_n_mul_mont_384 + +.globl sqr_n_mul_mont_383 +.hidden sqr_n_mul_mont_383 +.type sqr_n_mul_mont_383,@function +.align 32 +sqr_n_mul_mont_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp 
+.cfi_adjust_cfa_offset 8*17 + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +.Loop_sqr_383: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + + movd %xmm1,%edx + addq 48(%rsi),%r14 + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + leaq 0(%rdi),%rsi + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + decl %edx + jnz .Loop_sqr_383 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*23 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 +.type __mulq_mont_383_nonred,@function +.align 32 +__mulq_mont_383_nonred: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rbp + mulq %r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%r15 + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%r15 + movq %r8,%rax + adcq %rdx,%r15 + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r9 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rcx) + addq %r15,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %r15,%r13 + adcq %rdx,%r14 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + movq %r9,%r8 + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rsi) + addq %r15,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rcx) + addq %rax,%r8 + movq %r9,%rax + adcq %rdx,%r8 + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rcx) + addq %r8,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rcx) + 
addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r8,%r14 + adcq %rdx,%r15 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r10,%r9 + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rcx) + addq %rax,%r9 + movq %r10,%rax + adcq %rdx,%r9 + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rcx) + addq %r9,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r9,%r15 + adcq %rdx,%r8 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r11,%r10 + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rcx) + addq %rax,%r10 + movq %r11,%rax + adcq %rdx,%r10 + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rcx) + addq %r10,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r10,%r8 + adcq %rdx,%r9 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r12,%r11 + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r12,%rax + 
adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rcx) + addq %rax,%r11 + movq %r12,%rax + adcq %rdx,%r11 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rcx) + addq %r11,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r11,%r9 + adcq %rdx,%r10 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r13,%r12 + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 0(%rcx) + addq %rax,%r12 + movq %r13,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 24(%rcx) + addq %r12,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r12,%r10 + adcq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_mont_383_nonred,.-__mulq_mont_383_nonred +.globl sqr_mont_382x +.hidden sqr_mont_382x +.type sqr_mont_382x,@function +.align 32 +sqr_mont_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rsi,16(%rsp) + movq %rdi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + 
leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq 24(%rsp),%rdi + call __mulq_mont_383_nonred + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %r8,64(%rdi) + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_383_nonred + movq 32+96(%rsp),%rsi + movq 32+0(%rsp),%r12 + movq 32+8(%rsp),%r13 + andq %rsi,%r12 + movq 32+16(%rsp),%rax + andq %rsi,%r13 + movq 32+24(%rsp),%rbx + andq %rsi,%rax + movq 32+32(%rsp),%rbp + andq %rsi,%rbx + andq %rsi,%rbp + andq 32+40(%rsp),%rsi + + subq %r12,%r14 + movq 0(%rcx),%r12 + sbbq %r13,%r15 + movq 8(%rcx),%r13 + sbbq %rax,%r8 + movq 16(%rcx),%rax + sbbq %rbx,%r9 + movq 24(%rcx),%rbx + sbbq %rbp,%r10 + movq 32(%rcx),%rbp + sbbq %rsi,%r11 + sbbq %rsi,%rsi + + andq %rsi,%r12 + andq %rsi,%r13 + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r12,%r14 + adcq %r13,%r15 + adcq %rax,%r8 + adcq %rbx,%r9 + adcq %rbp,%r10 + adcq %rsi,%r11 + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_382x,.-sqr_mont_382x + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/mulx_mont_256-x86_64.s b/crypto/blst_src/build/elf/mulx_mont_256-x86_64.s new file mode 100644 index 00000000000..20a02073246 --- /dev/null +++ b/crypto/blst_src/build/elf/mulx_mont_256-x86_64.s @@ -0,0 +1,627 @@ +.text + +.globl mulx_mont_sparse_256 +.hidden mulx_mont_sparse_256 +.type mulx_mont_sparse_256,@function +.align 32 +mulx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_mont_sparse_256,.-mulx_mont_sparse_256 + +.globl sqrx_mont_sparse_256 +.hidden sqrx_mont_sparse_256 +.type sqrx_mont_sparse_256,@function +.align 32 +sqrx_mont_sparse_256: +.cfi_startproc + .byte 
0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + movq %rcx,%r8 + movq %rdx,%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_sparse_256,.-sqrx_mont_sparse_256 +.type __mulx_mont_sparse_256,@function +.align 32 +__mulx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + mulxq %r15,%r15,%r12 + mulxq %rbp,%rbp,%r13 + addq %r15,%r11 + mulxq %r9,%r9,%r14 + movq 8(%rbx),%rdx + adcq %rbp,%r12 + adcq %r9,%r13 + adcq $0,%r14 + + movq %rax,%r10 + imulq %r8,%rax + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r11 + adcxq %r9,%r12 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r14 + adcxq %r15,%r9 + adoxq %r9,%r15 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r10 + adoxq %r11,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r12 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r12 + adoxq %r9,%r13 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 16(%rbx),%rdx + adcxq %rbp,%r13 + adoxq %r9,%r14 + adcxq %r10,%r14 + adoxq %r10,%r15 + adcxq %r10,%r15 + adoxq %r10,%r10 + adcq $0,%r10 + movq %rax,%r11 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r15 + adcxq %r10,%r9 + adoxq %r9,%r10 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r11 + adoxq %r12,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r13 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r13 + adoxq %r9,%r14 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 24(%rbx),%rdx + adcxq %rbp,%r14 + adoxq %r9,%r15 + adcxq %r11,%r15 + adoxq %r11,%r10 + adcxq %r11,%r10 + adoxq %r11,%r11 + adcq $0,%r11 + movq %rax,%r12 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r15 + adcxq %r9,%r10 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r10 + adcxq %r11,%r9 + adoxq %r9,%r11 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r12 + adoxq %r13,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r14 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %rax,%rdx + adcxq %rbp,%r15 + adoxq %r9,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + adoxq %r12,%r12 + adcq $0,%r12 + imulq %r8,%rdx + + + xorq %rbp,%rbp + mulxq 
0+128(%rcx),%r13,%r9 + adcxq %rax,%r13 + adoxq %r9,%r14 + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r15 + adoxq %r9,%r10 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %r14,%rdx + leaq 128(%rcx),%rcx + adcxq %rbp,%r10 + adoxq %r9,%r11 + movq %r15,%rax + adcxq %r13,%r11 + adoxq %r13,%r12 + adcq $0,%r12 + + + + + movq %r10,%rbp + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + sbbq 16(%rcx),%r10 + movq %r11,%r9 + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rdx,%r14 + cmovcq %rax,%r15 + cmovcq %rbp,%r10 + movq %r14,0(%rdi) + cmovcq %r9,%r11 + movq %r15,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_mont_sparse_256,.-__mulx_mont_sparse_256 +.globl fromx_mont_256 +.hidden fromx_mont_256 +.type fromx_mont_256,@function +.align 32 +fromx_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + + + + + movq %r15,%rdx + movq %r10,%r12 + movq %r11,%r13 + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + sbbq 24(%rbx),%r11 + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size fromx_mont_256,.-fromx_mont_256 + +.globl redcx_mont_256 +.hidden redcx_mont_256 +.type redcx_mont_256,@function +.align 32 +redcx_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + addq 32(%rsi),%r14 + adcq 40(%rsi),%r15 + movq %r14,%rax + adcq 48(%rsi),%r10 + movq %r15,%rdx + adcq 56(%rsi),%r11 + sbbq %rsi,%rsi + + + + + movq %r10,%r12 + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + movq %r11,%r13 + sbbq 24(%rbx),%r11 + sbbq $0,%rsi + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redcx_mont_256,.-redcx_mont_256 +.type __mulx_by_1_mont_256,@function +.align 32 +__mulx_by_1_mont_256: +.cfi_startproc + .byte 
0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r11 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r10 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r10 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + movq %r13,%r11 + imulq %rcx,%r13 + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_by_1_mont_256,.-__mulx_by_1_mont_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/mulx_mont_384-x86_64.s b/crypto/blst_src/build/elf/mulx_mont_384-x86_64.s new file mode 100644 index 00000000000..9f9f7404ee4 --- /dev/null +++ b/crypto/blst_src/build/elf/mulx_mont_384-x86_64.s @@ -0,0 +1,2968 @@ +.text + + + + + + + +.type __sub_mod_384x384,@function +.align 32 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq 
%r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,@function +.align 32 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,@function +.align 32 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384,.-__sub_mod_384 +.globl mulx_mont_384x +.hidden mulx_mont_384x +.type mulx_mont_384x,@function +.align 32 +mulx_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $328,%rsp +.cfi_adjust_cfa_offset 328 + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulx_384 + + + leaq 48(%rbx),%rbx + leaq 128+48(%rsi),%rsi + leaq 96(%rdi),%rdi + call __mulx_384 + + + movq 8(%rsp),%rcx + leaq (%rbx),%rsi + leaq -48(%rbx),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulx_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + leaq (%rcx),%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi 
+ call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -328-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_mont_384x,.-mulx_mont_384x +.globl sqrx_mont_384x +.hidden sqrx_mont_384x +.type sqrx_mont_384x,@function +.align 32 +sqrx_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + movq %rdx,%r8 + adcq %r12,%r12 + movq %r15,%r9 + adcq %rdi,%rdi + movq %rax,%r10 + adcq %rbp,%rbp + movq %r12,%r11 + sbbq %rsi,%rsi + + subq 0(%rcx),%rdx + sbbq 8(%rcx),%r15 + movq %rdi,%r13 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r12 + sbbq 32(%rcx),%rdi + movq %rbp,%r14 + sbbq 40(%rcx),%rbp + sbbq $0,%rsi + + cmovcq %r8,%rdx + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %rdx,48(%rbx) + cmovcq %r11,%r12 + movq %r15,56(%rbx) + cmovcq %r13,%rdi + movq %rax,64(%rbx) + cmovcq %r14,%rbp + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_384x,.-sqrx_mont_384x + +.globl mulx_382x +.hidden mulx_382x +.type mulx_382x,@function +.align 32 +mulx_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 
24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulx_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulx_384 + + + leaq 48+128(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulx_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_382x,.-mulx_382x +.globl sqrx_382x +.hidden sqrx_382x +.type sqrx_382x,@function +.align 32 +sqrx_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulx_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulx_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + 
movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_382x,.-sqrx_382x +.globl mulx_384 +.hidden mulx_384 +.type mulx_384,@function +.align 32 +mulx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + + movq %rdx,%rbx + call __mulx_384 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_384,.-mulx_384 + +.type __mulx_384,@function +.align 32 +__mulx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + leaq -128(%rsi),%rsi + + mulxq %r14,%r9,%rcx + xorq %rbp,%rbp + + mulxq %r15,%r8,%rax + adcxq %rcx,%r8 + movq %r9,0(%rdi) + + mulxq %r10,%r9,%rcx + adcxq %rax,%r9 + + mulxq %r11,%r10,%rax + adcxq %rcx,%r10 + + mulxq %r12,%r11,%rcx + adcxq %rax,%r11 + + mulxq %r13,%r12,%r13 + movq 8(%rbx),%rdx + adcxq %rcx,%r12 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,8(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 16(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,16(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 24(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,24(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 32(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,32(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 
128+40(%rsi),%r12,%r13 + movq 40(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,40(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq %rax,%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + movq %r10,64(%rdi) + movq %r11,72(%rdi) + movq %r12,80(%rdi) + movq %r13,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_384,.-__mulx_384 +.globl sqrx_384 +.hidden sqrx_384 +.type sqrx_384,@function +.align 32 +sqrx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + call __sqrx_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_384,.-sqrx_384 +.type __sqrx_384,@function +.align 32 +__sqrx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%rcx + movq 32(%rsi),%rbx + + + mulxq %r14,%r8,%rdi + movq 40(%rsi),%rbp + mulxq %r15,%r9,%rax + addq %rdi,%r9 + mulxq %rcx,%r10,%rdi + adcq %rax,%r10 + mulxq %rbx,%r11,%rax + adcq %rdi,%r11 + mulxq %rbp,%r12,%r13 + movq %r14,%rdx + adcq %rax,%r12 + adcq $0,%r13 + + + xorq %r14,%r14 + mulxq %r15,%rdi,%rax + adcxq %rdi,%r10 + adoxq %rax,%r11 + + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r11 + adoxq %rax,%r12 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbp,%rdi,%rax + movq %r15,%rdx + adcxq %rdi,%r13 + adoxq %r14,%rax + adcxq %rax,%r14 + + + xorq %r15,%r15 + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r13 + adoxq %rax,%r14 + + mulxq %rbp,%rdi,%rax + movq %rcx,%rdx + adcxq %rdi,%r14 + adoxq %r15,%rax + adcxq %rax,%r15 + + + xorq %rcx,%rcx + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r14 + adoxq %rax,%r15 + + mulxq %rbp,%rdi,%rax + movq %rbx,%rdx + adcxq %rdi,%r15 + adoxq %rcx,%rax + adcxq %rax,%rcx + + + mulxq %rbp,%rdi,%rbx + movq 0(%rsi),%rdx + addq %rdi,%rcx + movq 8(%rsp),%rdi + adcq $0,%rbx + + + xorq %rbp,%rbp + adcxq %r8,%r8 + adcxq %r9,%r9 + adcxq %r10,%r10 + adcxq %r11,%r11 + adcxq %r12,%r12 + + + mulxq %rdx,%rdx,%rax + movq %rdx,0(%rdi) + movq 8(%rsi),%rdx + adoxq %rax,%r8 + movq %r8,8(%rdi) + + mulxq %rdx,%r8,%rax + movq 16(%rsi),%rdx + adoxq %r8,%r9 + adoxq %rax,%r10 + movq %r9,16(%rdi) + movq %r10,24(%rdi) + + mulxq %rdx,%r8,%r9 + movq 24(%rsi),%rdx + adoxq %r8,%r11 + adoxq %r9,%r12 + adcxq %r13,%r13 + adcxq %r14,%r14 + movq %r11,32(%rdi) + movq %r12,40(%rdi) + + mulxq %rdx,%r8,%r9 + movq 32(%rsi),%rdx + adoxq %r8,%r13 + adoxq %r9,%r14 + adcxq %r15,%r15 + adcxq %rcx,%rcx + movq %r13,48(%rdi) + movq %r14,56(%rdi) + + mulxq %rdx,%r8,%r9 
+ movq 40(%rsi),%rdx + adoxq %r8,%r15 + adoxq %r9,%rcx + adcxq %rbx,%rbx + adcxq %rbp,%rbp + movq %r15,64(%rdi) + movq %rcx,72(%rdi) + + mulxq %rdx,%r8,%r9 + adoxq %r8,%rbx + adoxq %r9,%rbp + + movq %rbx,80(%rdi) + movq %rbp,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sqrx_384,.-__sqrx_384 + + + +.globl redcx_mont_384 +.hidden redcx_mont_384 +.type redcx_mont_384,@function +.align 32 +redcx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redcx_mont_384,.-redcx_mont_384 + + + + +.globl fromx_mont_384 +.hidden fromx_mont_384 +.type fromx_mont_384,@function +.align 32 +fromx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + + + + + movq %r14,%rax + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size fromx_mont_384,.-fromx_mont_384 +.type __mulx_by_1_mont_384,@function +.align 32 +__mulx_by_1_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq %rcx,%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + imulq %r8,%rdx + + + xorq %r14,%r14 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r13 + adoxq %r14,%rbp + adcxq %rbp,%r14 + imulq %r9,%rdx + + + xorq %r15,%r15 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r9 + 
adoxq %rbp,%r10 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r14 + adoxq %r15,%rbp + adcxq %rbp,%r15 + imulq %r10,%rdx + + + xorq %r8,%r8 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r15 + adoxq %r8,%rbp + adcxq %rbp,%r8 + imulq %r11,%rdx + + + xorq %r9,%r9 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r8 + adoxq %r9,%rbp + adcxq %rbp,%r9 + imulq %r12,%rdx + + + xorq %r10,%r10 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r9 + adoxq %r10,%rbp + adcxq %rbp,%r10 + imulq %r13,%rdx + + + xorq %r11,%r11 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r10 + adoxq %r11,%rbp + adcxq %rbp,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_by_1_mont_384,.-__mulx_by_1_mont_384 + +.type __redc_tail_mont_384,@function +.align 32 +__redc_tail_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl sgn0x_pty_mont_384 +.hidden sgn0x_pty_mont_384 +.type sgn0x_pty_mont_384,@function +.align 32 +sgn0x_pty_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0x_pty_mont_384,.-sgn0x_pty_mont_384 + +.globl sgn0x_pty_mont_384x +.hidden sgn0x_pty_mont_384x +.type sgn0x_pty_mont_384x,@function +.align 32 +sgn0x_pty_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0x_pty_mont_384x,.-sgn0x_pty_mont_384x +.globl mulx_mont_384 +.hidden mulx_mont_384 +.type mulx_mont_384,@function +.align 32 +mulx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 
8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + movq %r8,(%rsp) + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_mont_384,.-mulx_mont_384 +.type __mulx_mont_384,@function +.align 32 +__mulx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + xorq %r15,%r15 + + movq %r8,16(%rsp) + imulq 8(%rsp),%r8 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %rbp,%r15 + adoxq %rax,%r15 + adoxq %rax,%rax + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %r8,%r14 + adoxq %r8,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r9,16(%rsp) + imulq 8(%rsp),%r9 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rbp,%rax + adoxq %r8,%rax + adoxq %r8,%r8 + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r9,%r15 + adoxq %r9,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r10,16(%rsp) + imulq 8(%rsp),%r10 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 
32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %rbp,%r8 + adoxq %r9,%r8 + adoxq %r9,%r9 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r10,%rax + adoxq %r10,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r11,16(%rsp) + imulq 8(%rsp),%r11 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %rbp,%r9 + adoxq %r10,%r9 + adoxq %r10,%r10 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r11,%r8 + adoxq %r11,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + movq %r12,16(%rsp) + imulq 8(%rsp),%r12 + + + xorq %r11,%r11 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %rbp,%r10 + adoxq %r11,%r10 + adoxq %r11,%r11 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r12,%r9 + adoxq %r12,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + movq %r15,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + movq %rax,%rsi + + mulxq 40+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + movq %r14,%rdx + adcxq %r12,%r10 + adoxq %r12,%r11 + leaq 128(%rcx),%rcx + movq %r8,%r12 + adcq $0,%r11 + + + + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r9,%rdi + sbbq 16(%rcx),%rax + sbbq 
24(%rcx),%r8 + sbbq 32(%rcx),%r9 + movq %r10,%rbp + sbbq 40(%rcx),%r10 + sbbq $0,%r11 + + cmovncq %r14,%rdx + cmovcq %r13,%r15 + cmovcq %rsi,%rax + cmovncq %r8,%r12 + movq %rdx,0(%rbx) + cmovncq %r9,%rdi + movq %r15,8(%rbx) + cmovncq %r10,%rbp + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_mont_384,.-__mulx_mont_384 +.globl sqrx_mont_384 +.hidden sqrx_mont_384 +.type sqrx_mont_384,@function +.align 32 +sqrx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rcx,%r8 + leaq -128(%rdx),%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + leaq (%rsi),%rbx + movq %r8,(%rsp) + leaq -128(%rsi),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_384,.-sqrx_mont_384 + +.globl sqrx_n_mul_mont_384 +.hidden sqrx_n_mul_mont_384 +.type sqrx_n_mul_mont_384,@function +.align 32 +sqrx_n_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -40(%rsp),%rsp +.cfi_adjust_cfa_offset 8*5 + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + +.Loop_sqrx_384: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movd %xmm1,%r10d + decl %r10d + jnz .Loop_sqrx_384 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 +.cfi_restore %r15 + movq 48(%rsp),%r14 +.cfi_restore %r14 + movq 56(%rsp),%r13 +.cfi_restore %r13 + movq 64(%rsp),%r12 +.cfi_restore %r12 + movq 72(%rsp),%rbx +.cfi_restore %rbx + movq 80(%rsp),%rbp +.cfi_restore %rbp + leaq 88(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_n_mul_mont_384,.-sqrx_n_mul_mont_384 + +.globl sqrx_n_mul_mont_383 +.hidden sqrx_n_mul_mont_383 +.type sqrx_n_mul_mont_383,@function +.align 32 +sqrx_n_mul_mont_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq 
%r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -40(%rsp),%rsp +.cfi_adjust_cfa_offset 8*5 + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + leaq -128(%rcx),%rcx + +.Loop_sqrx_383: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_383_nonred + + movd %xmm1,%r10d + decl %r10d + jnz .Loop_sqrx_383 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 +.cfi_restore %r15 + movq 48(%rsp),%r14 +.cfi_restore %r14 + movq 56(%rsp),%r13 +.cfi_restore %r13 + movq 64(%rsp),%r12 +.cfi_restore %r12 + movq 72(%rsp),%rbx +.cfi_restore %rbx + movq 80(%rsp),%rbp +.cfi_restore %rbp + leaq 88(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_n_mul_mont_383,.-sqrx_n_mul_mont_383 +.type __mulx_mont_383_nonred,@function +.align 32 +__mulx_mont_383_nonred: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + movq %r8,%rax + imulq 8(%rsp),%r8 + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %r15,%rbp + adoxq %rbp,%r15 + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %rax,%r14 + adoxq %rax,%r15 + adcxq %rax,%r15 + movq %r9,%r8 + imulq 8(%rsp),%r9 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rax,%rbp + adoxq %rbp,%rax + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r10,%r9 + imulq 
8(%rsp),%r10 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %r8,%rbp + adoxq %rbp,%r8 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r11,%r10 + imulq 8(%rsp),%r11 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %r9,%rbp + adoxq %rbp,%r9 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r12,%r11 + imulq 8(%rsp),%r12 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %r10,%rbp + adoxq %rbp,%r10 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r14,%rdx + adcxq %rdi,%r9 + adoxq %rbp,%r10 + adcq $0,%r10 + movq %r8,%r12 + + movq %r14,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + 
movq %r9,%rdi + movq %r8,24(%rbx) + movq %r9,32(%rbx) + movq %r10,40(%rbx) + movq %r10,%rbp + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_mont_383_nonred,.-__mulx_mont_383_nonred +.globl sqrx_mont_382x +.hidden sqrx_mont_382x +.type sqrx_mont_382x,@function +.align 32 +sqrx_mont_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + adcq %r12,%r12 + adcq %rdi,%rdi + adcq %rbp,%rbp + + movq %rdx,48(%rbx) + movq %r15,56(%rbx) + movq %rax,64(%rbx) + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32-128(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + + + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + movq 32+96(%rsp),%r14 + leaq 128(%rcx),%rcx + movq 32+0(%rsp),%r8 + andq %r14,%r8 + movq 32+8(%rsp),%r9 + andq %r14,%r9 + movq 32+16(%rsp),%r10 + andq %r14,%r10 + movq 32+24(%rsp),%r11 + andq %r14,%r11 + movq 32+32(%rsp),%r13 + andq %r14,%r13 + andq 32+40(%rsp),%r14 + + subq %r8,%rdx + movq 0(%rcx),%r8 + sbbq %r9,%r15 + movq 8(%rcx),%r9 + sbbq %r10,%rax + movq 16(%rcx),%r10 + sbbq %r11,%r12 + movq 24(%rcx),%r11 + sbbq %r13,%rdi + movq 32(%rcx),%r13 + sbbq %r14,%rbp + sbbq %r14,%r14 + + andq %r14,%r8 + andq %r14,%r9 + andq %r14,%r10 + andq %r14,%r11 + andq %r14,%r13 + andq 40(%rcx),%r14 + + addq %r8,%rdx + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%r12 + adcq %r13,%rdi + adcq %r14,%rbp + + movq %rdx,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + 
.byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_382x,.-sqrx_mont_382x + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/sha256-armv8.S b/crypto/blst_src/build/elf/sha256-armv8.S new file mode 100644 index 00000000000..7341decf4f5 --- /dev/null +++ b/crypto/blst_src/build/elf/sha256-armv8.S @@ -0,0 +1,1077 @@ +// +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// ==================================================================== +// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +// project. +// ==================================================================== +// +// sha256_block procedure for ARMv8. +// +// This module is stripped of scalar code paths, with the rationale that all +// known processors are NEON-capable. +// +// See original module at CRYPTOGAMS for further details. + +.text + +.align 6 +.type .LK256,%object +.LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator +.size .LK256,.-.LK256 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.align 2 +.align 2 +.globl blst_sha256_block_armv8 +.type blst_sha256_block_armv8,%function +.align 6 +blst_sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]!
+ add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adr x3,.LK256 + +.Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 
v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,.Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret +.size blst_sha256_block_armv8,.-blst_sha256_block_armv8 +.globl blst_sha256_block_data_order +.type blst_sha256_block_data_order,%function +.align 4 +blst_sha256_block_data_order: + stp x29, x30, [sp, #-16]! + mov x29, sp + sub sp,sp,#16*4 + + adr x16,.LK256 + add x2,x1,x2,lsl#6 // len to point at the end of inp + + ld1 {v0.16b},[x1], #16 + ld1 {v1.16b},[x1], #16 + ld1 {v2.16b},[x1], #16 + ld1 {v3.16b},[x1], #16 + ld1 {v4.4s},[x16], #16 + ld1 {v5.4s},[x16], #16 + ld1 {v6.4s},[x16], #16 + ld1 {v7.4s},[x16], #16 + rev32 v0.16b,v0.16b // yes, even on + rev32 v1.16b,v1.16b // big-endian + rev32 v2.16b,v2.16b + rev32 v3.16b,v3.16b + mov x17,sp + add v4.4s,v4.4s,v0.4s + add v5.4s,v5.4s,v1.4s + add v6.4s,v6.4s,v2.4s + st1 {v4.4s,v5.4s},[x17], #32 + add v7.4s,v7.4s,v3.4s + st1 {v6.4s,v7.4s},[x17] + sub x17,x17,#32 + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#8] + ldp w7,w8,[x0,#16] + ldp w9,w10,[x0,#24] + ldr w12,[sp,#0] + mov w13,wzr + eor w14,w4,w5 + mov w15,wzr + b .L_00_48 + +.align 4 +.L_00_48: + ext v4.16b,v0.16b,v1.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v2.16b,v3.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v3.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v0.4s,v0.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v0.4s,v0.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v0.4s,v0.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v0.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v0.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v0.4s,#15 + add w8,w8,w12 + ushr v17.4s,v0.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v0.4s,#13 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + 
eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v0.4s,v0.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v0.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v1.16b,v2.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v3.16b,v0.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v0.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v1.4s,v1.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v1.4s,v1.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v1.4s,v1.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v1.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v1.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v1.4s,#15 + add w4,w4,w12 + ushr v17.4s,v1.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v1.4s,#13 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v1.4s,v1.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v1.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + ext v4.16b,v2.16b,v3.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v0.16b,v1.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v1.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v2.4s,v2.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor 
w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v2.4s,v2.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v2.4s,v2.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v2.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v2.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v2.4s,#15 + add w8,w8,w12 + ushr v17.4s,v2.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v2.4s,#13 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v2.4s,v2.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v2.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v3.16b,v0.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v1.16b,v2.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v2.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v3.4s,v3.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v3.4s,v3.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v3.4s,v3.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v3.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v3.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v3.4s,#15 + add w4,w4,w12 + ushr v17.4s,v3.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v3.4s,#13 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v3.4s,v3.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v3.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[x16] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + 
eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + cmp w12,#0 // check for K256 terminator + ldr w12,[sp,#0] + sub x17,x17,#64 + bne .L_00_48 + + sub x16,x16,#256 // rewind x16 + cmp x1,x2 + mov x17, #64 + csel x17, x17, xzr, eq + sub x1,x1,x17 // avoid SEGV + mov x17,sp + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v0.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v0.16b,v0.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v0.4s + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v1.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v1.16b,v1.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v1.4s + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v2.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v2.16b,v2.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + 
eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v2.4s + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v3.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v3.16b,v3.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v3.4s + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w3,w3,w15 // h+=Sigma0(a) from the past + ldp w11,w12,[x0,#0] + add w3,w3,w13 // h+=Maj(a,b,c) from the past + ldp w13,w14,[x0,#8] + add w3,w3,w11 // accumulate + add w4,w4,w12 + ldp w11,w12,[x0,#16] + add w5,w5,w13 + add w6,w6,w14 + ldp w13,w14,[x0,#24] + add w7,w7,w11 + add w8,w8,w12 + ldr w12,[sp,#0] + stp w3,w4,[x0,#0] + add w9,w9,w13 + mov w13,wzr + stp w5,w6,[x0,#8] + add w10,w10,w14 + stp w7,w8,[x0,#16] + eor w14,w4,w5 + stp w9,w10,[x0,#24] + mov w15,wzr + mov x17,sp + b.ne .L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret +.size blst_sha256_block_data_order,.-blst_sha256_block_data_order +.globl blst_sha256_emit +.hidden blst_sha256_emit +.type blst_sha256_emit,%function +.align 4 +blst_sha256_emit: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] +#ifndef __AARCH64EB__ + rev x4,x4 + rev x5,x5 + rev x6,x6 + rev x7,x7 +#endif + str w4,[x0,#4] + 
lsr x4,x4,#32 + str w5,[x0,#12] + lsr x5,x5,#32 + str w6,[x0,#20] + lsr x6,x6,#32 + str w7,[x0,#28] + lsr x7,x7,#32 + str w4,[x0,#0] + str w5,[x0,#8] + str w6,[x0,#16] + str w7,[x0,#24] + ret +.size blst_sha256_emit,.-blst_sha256_emit + +.globl blst_sha256_bcopy +.hidden blst_sha256_bcopy +.type blst_sha256_bcopy,%function +.align 4 +blst_sha256_bcopy: +.Loop_bcopy: + ldrb w3,[x1],#1 + sub x2,x2,#1 + strb w3,[x0],#1 + cbnz x2,.Loop_bcopy + ret +.size blst_sha256_bcopy,.-blst_sha256_bcopy + +.globl blst_sha256_hcopy +.hidden blst_sha256_hcopy +.type blst_sha256_hcopy,%function +.align 4 +blst_sha256_hcopy: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + ret +.size blst_sha256_hcopy,.-blst_sha256_hcopy diff --git a/crypto/blst_src/build/elf/sha256-portable-x86_64.s b/crypto/blst_src/build/elf/sha256-portable-x86_64.s new file mode 100644 index 00000000000..20b5c411306 --- /dev/null +++ b/crypto/blst_src/build/elf/sha256-portable-x86_64.s @@ -0,0 +1,1754 @@ +.text + +.globl blst_sha256_block_data_order +.type blst_sha256_block_data_order,@function +.align 16 +blst_sha256_block_data_order: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $64+24,%rsp +.cfi_adjust_cfa_offset 16*4+3*8 + leaq (%rsi,%rdx,4),%rdx + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp .Lloop + +.align 16 +.Lloop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 0(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 4(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 4(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 8(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + 
movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 12(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 16(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 20(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 20(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 24(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 28(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + addl %r14d,%eax + movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 32(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + 
addl %r12d,%r11d + addl %r14d,%r11d + movl 36(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 36(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 40(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 44(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 48(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 52(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 56(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl 
%r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 60(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + + addl 0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 64(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 8(%rsp),%r13d + movl 60(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 68(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 72(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 16(%rsp),%r13d + movl 4(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d 
+ rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 76(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + + addl 16(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 80(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 24(%rsp),%r13d + movl 12(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + + addl 20(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 84(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 88(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + 
addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 92(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + movl 36(%rsp),%r13d + movl 24(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 96(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 40(%rsp),%r13d + movl 28(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 100(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 104(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl 
%r9d,%edi + addl 108(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 112(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d + + addl 52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 116(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d + + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 120(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d + + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 124(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + 
+ andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + leaq 64(%rbp),%rbp + cmpb $0x19,3(%rbp) + jnz .Lrounds_16_xx + + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop + + leaq 64+24+48(%rsp),%r11 +.cfi_def_cfa %r11,8 + movq 64+24(%rsp),%r15 +.cfi_restore %r15 + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbp +.cfi_restore %rbp + movq -8(%r11),%rbx +.cfi_restore %rbx + + leaq (%r11),%rsp + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_block_data_order,.-blst_sha256_block_data_order + +.align 64 +.type K256,@object +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl blst_sha256_emit +.hidden blst_sha256_emit +.type blst_sha256_emit,@function +.align 16 +blst_sha256_emit: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + bswapq %r8 + movq 24(%rsi),%r11 + bswapq %r9 + movl %r8d,4(%rdi) + bswapq %r10 + movl %r9d,12(%rdi) + bswapq %r11 + movl %r10d,20(%rdi) + shrq $32,%r8 + movl %r11d,28(%rdi) + shrq $32,%r9 + movl %r8d,0(%rdi) + shrq $32,%r10 + movl %r9d,8(%rdi) + shrq $32,%r11 + movl %r10d,16(%rdi) + movl %r11d,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_emit,.-blst_sha256_emit + +.globl blst_sha256_bcopy +.hidden blst_sha256_bcopy +.type blst_sha256_bcopy,@function +.align 16 +blst_sha256_bcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rsi,%rdi +.Loop_bcopy: + movzbl (%rsi),%eax + leaq 1(%rsi),%rsi + movb %al,-1(%rdi,%rsi,1) + decq %rdx + jnz .Loop_bcopy + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_bcopy,.-blst_sha256_bcopy + +.globl blst_sha256_hcopy +.hidden blst_sha256_hcopy +.type blst_sha256_hcopy,@function +.align 16 +blst_sha256_hcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_hcopy,.-blst_sha256_hcopy + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 
4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/sha256-x86_64.s b/crypto/blst_src/build/elf/sha256-x86_64.s new file mode 100644 index 00000000000..47fdc5bc57a --- /dev/null +++ b/crypto/blst_src/build/elf/sha256-x86_64.s @@ -0,0 +1,1446 @@ +.text + +.align 64 +.type K256,@object +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl blst_sha256_block_data_order_shaext +.hidden blst_sha256_block_data_order_shaext +.type blst_sha256_block_data_order_shaext,@function +.align 64 +blst_sha256_block_data_order_shaext: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 256-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 16-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 48-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 64-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 80-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 96-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 
15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 112-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 144-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 176-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 208-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 224-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 240-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz .Loop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_block_data_order_shaext,.-blst_sha256_block_data_order_shaext +.globl blst_sha256_block_data_order +.hidden blst_sha256_block_data_order +.type blst_sha256_block_data_order,@function +.align 64 +blst_sha256_block_data_order: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $40,%rsp +.cfi_adjust_cfa_offset 40 + leaq (%rsi,%rdx,4),%rdx + movq %rdi,0(%rsp) + + movq %rdx,16(%rsp) + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + + + leaq -64(%rsp),%rsp + movl 0(%rdi),%eax + andq $-64,%rsp + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa K256+256(%rip),%xmm7 + movq %rsi,8(%rbp) + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + 
movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rsi +.byte 102,15,56,0,207 + movdqa 0(%rsi),%xmm4 + movdqa 16(%rsi),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 32(%rsi),%xmm6 +.byte 102,15,56,0,223 + movdqa 48(%rsi),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + subq $-64,%rsi + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d 
+ rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 16(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl 
%eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 32(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl 
%r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 48(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,67(%rsi) + jne .Lssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 
24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + 
movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 0(%rbp),%rdi + movl %r14d,%eax + movq 8(%rbp),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + leaq 64(%rsi),%rsi + cmpq 16(%rbp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_ssse3 + + xorps %xmm0,%xmm0 + leaq 40+48(%rbp),%r11 +.cfi_def_cfa %r11,8 + movaps %xmm0,0(%rsp) + movaps %xmm0,16(%rsp) + movaps %xmm0,32(%rsp) + movaps %xmm0,48(%rsp) + movq 40(%rbp),%r15 +.cfi_restore %r15 + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbx +.cfi_restore %rbx + movq -8(%r11),%rbp +.cfi_restore %rbp + + leaq (%r11),%rsp + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_block_data_order,.-blst_sha256_block_data_order +.globl blst_sha256_emit +.hidden blst_sha256_emit +.type blst_sha256_emit,@function +.align 16 +blst_sha256_emit: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + bswapq %r8 + movq 24(%rsi),%r11 + bswapq %r9 + movl %r8d,4(%rdi) + bswapq %r10 + movl %r9d,12(%rdi) + bswapq %r11 + movl %r10d,20(%rdi) + shrq $32,%r8 + movl %r11d,28(%rdi) + shrq $32,%r9 + movl %r8d,0(%rdi) + shrq $32,%r10 + movl %r9d,8(%rdi) + shrq $32,%r11 + movl %r10d,16(%rdi) + movl %r11d,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_emit,.-blst_sha256_emit + +.globl blst_sha256_bcopy +.hidden blst_sha256_bcopy +.type blst_sha256_bcopy,@function +.align 16 +blst_sha256_bcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rsi,%rdi +.Loop_bcopy: + movzbl (%rsi),%eax + leaq 1(%rsi),%rsi + movb %al,-1(%rdi,%rsi,1) + decq %rdx + jnz .Loop_bcopy + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_bcopy,.-blst_sha256_bcopy + +.globl blst_sha256_hcopy +.hidden blst_sha256_hcopy +.type blst_sha256_hcopy,@function +.align 16 +blst_sha256_hcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_hcopy,.-blst_sha256_hcopy + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/mach-o/add_mod_256-armv8.S b/crypto/blst_src/build/mach-o/add_mod_256-armv8.S new file mode 100644 index 
00000000000..198d65aef69 --- /dev/null +++ b/crypto/blst_src/build/mach-o/add_mod_256-armv8.S @@ -0,0 +1,379 @@ +.text + +.globl _add_mod_256 +.private_extern _add_mod_256 + +.align 5 +_add_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + adds x8,x8,x12 + ldp x14,x15,[x2,#16] + adcs x9,x9,x13 + ldp x4,x5,[x3] + adcs x10,x10,x14 + ldp x6,x7,[x3,#16] + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret + + +.globl _mul_by_3_mod_256 +.private_extern _mul_by_3_mod_256 + +.align 5 +_mul_by_3_mod_256: + ldp x12,x13,[x1] + ldp x14,x15,[x1,#16] + + adds x8,x12,x12 + ldp x4,x5,[x2] + adcs x9,x13,x13 + ldp x6,x7,[x2,#16] + adcs x10,x14,x14 + adcs x11,x15,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + adds x8,x8,x12 + adcs x9,x9,x13 + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret + + +.globl _lshift_mod_256 +.private_extern _lshift_mod_256 + +.align 5 +_lshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +Loop_lshift_mod_256: + adds x8,x8,x8 + sub x2,x2,#1 + adcs x9,x9,x9 + adcs x10,x10,x10 + adcs x11,x11,x11 + adc x3,xzr,xzr + + subs x12,x8,x4 + sbcs x13,x9,x5 + sbcs x14,x10,x6 + sbcs x15,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x12,lo + csel x9,x9,x13,lo + csel x10,x10,x14,lo + csel x11,x11,x15,lo + + cbnz x2,Loop_lshift_mod_256 + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret + + +.globl _rshift_mod_256 +.private_extern _rshift_mod_256 + +.align 5 +_rshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +Loop_rshift: + adds x12,x8,x4 + sub x2,x2,#1 + adcs x13,x9,x5 + adcs x14,x10,x6 + adcs x15,x11,x7 + adc x3,xzr,xzr + tst x8,#1 + + csel x12,x12,x8,ne + csel x13,x13,x9,ne + csel x14,x14,x10,ne + csel x15,x15,x11,ne + csel x3,x3,xzr,ne + + extr x8,x13,x12,#1 + extr x9,x14,x13,#1 + extr x10,x15,x14,#1 + extr x11,x3,x15,#1 + + cbnz x2,Loop_rshift + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret + + +.globl _cneg_mod_256 +.private_extern _cneg_mod_256 + +.align 5 +_cneg_mod_256: + ldp x8,x9,[x1] + ldp x4,x5,[x3] + + ldp x10,x11,[x1,#16] + subs x12,x4,x8 + ldp x6,x7,[x3,#16] + orr x4,x8,x9 + sbcs x13,x5,x9 + orr x5,x10,x11 + sbcs x14,x6,x10 + orr x3,x4,x5 + sbc x15,x7,x11 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x8,x8,x12,eq + csel x9,x9,x13,eq + csel x10,x10,x14,eq + stp x8,x9,[x0] + csel x11,x11,x15,eq + stp x10,x11,[x0,#16] + + ret + + +.globl _sub_mod_256 +.private_extern _sub_mod_256 + +.align 5 +_sub_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + subs x8,x8,x12 + ldp x14,x15,[x2,#16] + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + stp x8,x9,[x0] + adc x11,x11,x7 + stp x10,x11,[x0,#16] + + ret + + +.globl _check_mod_256 +.private_extern _check_mod_256 + +.align 5 +_check_mod_256: + ldp x8,x9,[x0] + ldp 
x10,x11,[x0,#16] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + subs xzr,x8,x4 + sbcs xzr,x9,x5 + orr x8,x8,x9 + sbcs xzr,x10,x6 + orr x8,x8,x10 + sbcs xzr,x11,x7 + orr x8,x8,x11 + sbc x1,xzr,xzr + + cmp x8,#0 + mov x0,#1 + csel x0,x0,xzr,ne + and x0,x0,x1 + + ret + + +.globl _add_n_check_mod_256 +.private_extern _add_n_check_mod_256 + +.align 5 +_add_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + adds x8,x8,x12 + ldp x4,x5,[x3] + adcs x9,x9,x13 + ldp x6,x7,[x3,#16] + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret + + +.globl _sub_n_check_mod_256 +.private_extern _sub_n_check_mod_256 + +.align 5 +_sub_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + subs x8,x8,x12 + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + adc x11,x11,x7 + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret + diff --git a/crypto/blst_src/build/mach-o/add_mod_256-x86_64.s b/crypto/blst_src/build/mach-o/add_mod_256-x86_64.s new file mode 100644 index 00000000000..19e5ba9834f --- /dev/null +++ b/crypto/blst_src/build/mach-o/add_mod_256-x86_64.s @@ -0,0 +1,564 @@ +.text + +.globl _add_mod_256 +.private_extern _add_mod_256 + +.p2align 5 +_add_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +L$oaded_a_add_mod_256: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _mul_by_3_mod_256 +.private_extern _mul_by_3_mod_256 + +.p2align 5 +_mul_by_3_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 
+.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq %rdx,%rcx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rsi,%rdx + movq 24(%rsi),%r11 + + call __lshift_mod_256 + movq 0(%rsp),%r12 +.cfi_restore %r12 + jmp L$oaded_a_add_mod_256 + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__lshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + movq %r8,%rax + adcq %r10,%r10 + movq %r9,%rsi + adcq %r11,%r11 + sbbq %r12,%r12 + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + cmovcq %rbx,%r10 + cmovcq %rbp,%r11 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _lshift_mod_256 +.private_extern _lshift_mod_256 + +.p2align 5 +_lshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +L$oop_lshift_mod_256: + call __lshift_mod_256 + decl %edx + jnz L$oop_lshift_mod_256 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _rshift_mod_256 +.private_extern _rshift_mod_256 + +.p2align 5 +_rshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rbp + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +L$oop_rshift_mod_256: + movq %rbp,%r8 + andq $1,%rbp + movq 0(%rcx),%rax + negq %rbp + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + + andq %rbp,%rax + andq %rbp,%rsi + andq %rbp,%rbx + andq 24(%rcx),%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + adcq %rbx,%r10 + adcq %rbp,%r11 + sbbq %rax,%rax + + shrq $1,%r8 + movq %r9,%rbp + shrq $1,%r9 + movq %r10,%rbx + shrq $1,%r10 + movq %r11,%rsi + shrq $1,%r11 + + shlq $63,%rbp + shlq $63,%rbx + orq %r8,%rbp + shlq $63,%rsi + orq %rbx,%r9 + shlq $63,%rax + orq %rsi,%r10 + orq %rax,%r11 + + decl %edx + jnz L$oop_rshift_mod_256 + + movq %rbp,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _cneg_mod_256 +.private_extern _cneg_mod_256 + +.p2align 5 +_cneg_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq 0(%rsi),%r12 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %r12,%r8 + movq 24(%rsi),%r11 + orq %r9,%r12 + orq %r10,%r12 + orq %r11,%r12 + movq $-1,%rbp + + movq 0(%rcx),%rax + cmovnzq %rbp,%r12 + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + andq %r12,%rax + movq 24(%rcx),%rbp + andq %r12,%rsi + andq %r12,%rbx + andq %r12,%rbp + + subq 
%r8,%rax + sbbq %r9,%rsi + sbbq %r10,%rbx + sbbq %r11,%rbp + + orq %rdx,%rdx + + cmovzq %r8,%rax + cmovzq %r9,%rsi + movq %rax,0(%rdi) + cmovzq %r10,%rbx + movq %rsi,8(%rdi) + cmovzq %r11,%rbp + movq %rbx,16(%rdi) + movq %rbp,24(%rdi) + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _sub_mod_256 +.private_extern _sub_mod_256 + +.p2align 5 +_sub_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _check_mod_256 +.private_extern _check_mod_256 + +.p2align 5 +_check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + movq 0(%rdi),%rax + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + + movq %rax,%r8 + orq %r9,%rax + orq %r10,%rax + orq %r11,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq %rsi,%rsi + + movq $1,%rdx + cmpq $0,%rax + cmovneq %rdx,%rax + andq %rsi,%rax + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _add_n_check_mod_256 +.private_extern _add_n_check_mod_256 + +.p2align 5 +_add_n_check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _sub_n_check_mod_256 +.private_extern _sub_n_check_mod_256 + +.p2align 5 +_sub_n_check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq 
%rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/add_mod_384-armv8.S b/crypto/blst_src/build/mach-o/add_mod_384-armv8.S new file mode 100644 index 00000000000..a62995f2bed --- /dev/null +++ b/crypto/blst_src/build/mach-o/add_mod_384-armv8.S @@ -0,0 +1,1000 @@ +.text + +.globl _add_mod_384 +.private_extern _add_mod_384 + +.align 5 +_add_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + + +.align 5 +__add_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + +__add_mod_384_ab_are_loaded: + adds x10,x10,x16 + adcs x11,x11,x17 + adcs x12,x12,x19 + adcs x13,x13,x20 + adcs x14,x14,x21 + adcs x15,x15,x22 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret + + +.globl _add_mod_384x +.private_extern _add_mod_384x + +.align 5 +_add_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _rshift_mod_384 +.private_extern _rshift_mod_384 + +.align 5 +_rshift_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +Loop_rshift_mod_384: + sub x2,x2,#1 + bl __rshift_mod_384 + cbnz x2,Loop_rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + + +.align 5 +__rshift_mod_384: + sbfx x22,x10,#0,#1 + and x16,x22,x4 + and x17,x22,x5 + adds x10,x10,x16 + and x19,x22,x6 + adcs x11,x11,x17 + and x20,x22,x7 + adcs x12,x12,x19 + and x21,x22,x8 + adcs x13,x13,x20 + and x22,x22,x9 + adcs x14,x14,x21 + extr x10,x11,x10,#1 // a[0:5] >>= 1 + adcs x15,x15,x22 + extr x11,x12,x11,#1 + adc x22,xzr,xzr + extr x12,x13,x12,#1 + extr x13,x14,x13,#1 + extr x14,x15,x14,#1 + extr x15,x22,x15,#1 + ret + + +.globl _div_by_2_mod_384 +.private_extern _div_by_2_mod_384 + +.align 5 +_div_by_2_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _lshift_mod_384 +.private_extern _lshift_mod_384 + +.align 5 +_lshift_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +Loop_lshift_mod_384: + sub x2,x2,#1 + bl __lshift_mod_384 + cbnz x2,Loop_lshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + + +.align 5 +__lshift_mod_384: + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret + + +.globl _mul_by_3_mod_384 +.private_extern _mul_by_3_mod_384 + +.align 5 +_mul_by_3_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _mul_by_8_mod_384 +.private_extern _mul_by_8_mod_384 + +.align 5 +_mul_by_8_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _mul_by_3_mod_384x +.private_extern _mul_by_3_mod_384x + +.align 5 +_mul_by_3_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + + ldp x16,x17,[x1,#48] + ldp x19,x20,[x1,#64] + ldp x21,x22,[x1,#80] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _mul_by_8_mod_384x +.private_extern _mul_by_8_mod_384x + +.align 5 +_mul_by_8_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _cneg_mod_384 +.private_extern _cneg_mod_384 + +.align 5 +_cneg_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x4,x5,[x3] + ldp x12,x13,[x1,#16] + ldp x6,x7,[x3,#16] + + subs x16,x4,x10 + ldp x14,x15,[x1,#32] + ldp x8,x9,[x3,#32] + orr x3,x10,x11 + sbcs x17,x5,x11 + orr x3,x3,x12 + sbcs x19,x6,x12 + orr x3,x3,x13 + sbcs x20,x7,x13 + orr x3,x3,x14 + sbcs x21,x8,x14 + orr x3,x3,x15 + sbc x22,x9,x15 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x10,x10,x16,eq + csel x11,x11,x17,eq + csel x12,x12,x19,eq + csel x13,x13,x20,eq + stp x10,x11,[x0] + csel x14,x14,x21,eq + stp x12,x13,[x0,#16] + csel x15,x15,x22,eq + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _sub_mod_384 +.private_extern _sub_mod_384 + +.align 5 +_sub_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + + +.align 5 +__sub_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + + subs x10,x10,x16 + sbcs x11,x11,x17 + sbcs x12,x12,x19 + sbcs x13,x13,x20 + sbcs x14,x14,x21 + sbcs x15,x15,x22 + sbc x3,xzr,xzr + + and x16,x4,x3 + and x17,x5,x3 + adds x10,x10,x16 + and x19,x6,x3 + adcs x11,x11,x17 + and x20,x7,x3 + adcs x12,x12,x19 + and x21,x8,x3 + adcs x13,x13,x20 + and x22,x9,x3 + adcs x14,x14,x21 + adc x15,x15,x22 + + ret + + +.globl _sub_mod_384x +.private_extern _sub_mod_384x + +.align 5 +_sub_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _mul_by_1_plus_i_mod_384x +.private_extern _mul_by_1_plus_i_mod_384x + +.align 5 +_mul_by_1_plus_i_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + add x2,x1,#48 + + bl __sub_mod_384 // a->re - a->im + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __add_mod_384_ab_are_loaded // a->re + a->im + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _sgn0_pty_mod_384 +.private_extern _sgn0_pty_mod_384 + +.align 5 +_sgn0_pty_mod_384: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x0,x10,#1 + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x3,x3,xzr + + mvn x3,x3 + and x3,x3,#2 + orr x0,x0,x3 + + ret + + +.globl _sgn0_pty_mod_384x +.private_extern _sgn0_pty_mod_384x + +.align 5 +_sgn0_pty_mod_384x: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x2,x10,#1 + orr x3,x10,x11 + adds x10,x10,x10 + orr x3,x3,x12 + adcs x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + ldp x10,x11,[x0,#48] + ldp x12,x13,[x0,#64] + ldp x14,x15,[x0,#80] + + mvn x16,x16 + and x16,x16,#2 + orr x2,x2,x16 + + and x0,x10,#1 + orr x1,x10,x11 + adds x10,x10,x10 + orr x1,x1,x12 + adcs x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + mvn x16,x16 + and x16,x16,#2 + orr x0,x0,x16 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ret + +.globl _vec_select_32 +.private_extern _vec_select_32 + +.align 5 +_vec_select_32: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + +.globl _vec_select_48 +.private_extern _vec_select_48 + +.align 5 +_vec_select_48: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + +.globl _vec_select_96 +.private_extern _vec_select_96 + +.align 5 +_vec_select_96: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + +.globl _vec_select_192 +.private_extern _vec_select_192 + +.align 5 +_vec_select_192: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + +.globl _vec_select_144 +.private_extern _vec_select_144 + +.align 5 +_vec_select_144: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + +.globl _vec_select_288 +.private_extern _vec_select_288 + +.align 5 +_vec_select_288: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, 
v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + +.globl _vec_prefetch +.private_extern _vec_prefetch + +.align 5 +_vec_prefetch: + add x1, x1, x0 + sub x1, x1, #1 + mov x2, #64 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + prfm pldl1keep, [x0] + ret + +.globl _vec_is_zero_16x +.private_extern _vec_is_zero_16x + +.align 5 +_vec_is_zero_16x: + ld1 {v0.2d}, [x0], #16 + lsr x1, x1, #4 + sub x1, x1, #1 + cbz x1, Loop_is_zero_done + +Loop_is_zero: + ld1 {v1.2d}, [x0], #16 + orr v0.16b, v0.16b, v1.16b + sub x1, x1, #1 + cbnz x1, Loop_is_zero + +Loop_is_zero_done: + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret + +.globl _vec_is_equal_16x +.private_extern _vec_is_equal_16x + +.align 5 +_vec_is_equal_16x: + ld1 {v0.2d}, [x0], #16 + ld1 {v1.2d}, [x1], #16 + lsr x2, x2, #4 + eor v0.16b, v0.16b, v1.16b + +Loop_is_equal: + sub x2, x2, #1 + cbz x2, Loop_is_equal_done + ld1 {v1.2d}, [x0], #16 + ld1 {v2.2d}, [x1], #16 + eor v1.16b, v1.16b, v2.16b + orr v0.16b, v0.16b, v1.16b + b Loop_is_equal + nop + +Loop_is_equal_done: + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret + diff --git a/crypto/blst_src/build/mach-o/add_mod_384-x86_64.s b/crypto/blst_src/build/mach-o/add_mod_384-x86_64.s new file mode 100644 index 00000000000..974978e3425 --- /dev/null +++ b/crypto/blst_src/build/mach-o/add_mod_384-x86_64.s @@ -0,0 +1,1899 @@ +.text + +.globl _add_mod_384 +.private_extern _add_mod_384 + +.p2align 5 +_add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __add_mod_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx 
+.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__add_mod_384_a_is_loaded: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _add_mod_384x +.private_extern _add_mod_384x + +.p2align 5 +_add_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 24 + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __add_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + movq 24+0(%rsp),%r15 +.cfi_restore %r15 + movq 24+8(%rsp),%r14 +.cfi_restore %r14 + movq 24+16(%rsp),%r13 +.cfi_restore %r13 + movq 24+24(%rsp),%r12 +.cfi_restore %r12 + movq 24+32(%rsp),%rbx +.cfi_restore %rbx + movq 24+40(%rsp),%rbp +.cfi_restore %rbp + leaq 24+48(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _rshift_mod_384 +.private_extern _rshift_mod_384 + +.p2align 5 +_rshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +L$oop_rshift_mod_384: + call __rshift_mod_384 + decl %edx + jnz L$oop_rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__rshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rsi + movq 0(%rcx),%r14 + andq %r8,%rsi + movq 8(%rcx),%r15 + negq %rsi + movq 16(%rcx),%rax + andq %rsi,%r14 + movq 
24(%rcx),%rbx + andq %rsi,%r15 + movq 32(%rcx),%rbp + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%rbx + adcq %r12,%rbp + adcq %r13,%rsi + sbbq %r13,%r13 + + shrq $1,%r14 + movq %r15,%r8 + shrq $1,%r15 + movq %rax,%r9 + shrq $1,%rax + movq %rbx,%r10 + shrq $1,%rbx + movq %rbp,%r11 + shrq $1,%rbp + movq %rsi,%r12 + shrq $1,%rsi + shlq $63,%r8 + shlq $63,%r9 + orq %r14,%r8 + shlq $63,%r10 + orq %r15,%r9 + shlq $63,%r11 + orq %rax,%r10 + shlq $63,%r12 + orq %rbx,%r11 + shlq $63,%r13 + orq %rbp,%r12 + orq %rsi,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _div_by_2_mod_384 +.private_extern _div_by_2_mod_384 + +.p2align 5 +_div_by_2_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq %rdx,%rcx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + call __rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _lshift_mod_384 +.private_extern _lshift_mod_384 + +.p2align 5 +_lshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +L$oop_lshift_mod_384: + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdi,%rdi + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdi + + movq (%rsp),%rdi + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + decl %edx + jnz L$oop_lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__lshift_mod_384: +.cfi_startproc + .byte 
0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _mul_by_3_mod_384 +.private_extern _mul_by_3_mod_384 + +.p2align 5 +_mul_by_3_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _mul_by_8_mod_384 +.private_extern _mul_by_8_mod_384 + +.p2align 5 +_mul_by_8_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _mul_by_3_mod_384x +.private_extern _mul_by_3_mod_384x + +.p2align 5 +_mul_by_3_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call 
__lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq (%rsp),%rsi + leaq 48(%rdi),%rdi + + movq 48(%rsi),%r8 + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + movq 72(%rsi),%r11 + movq 80(%rsi),%r12 + movq 88(%rsi),%r13 + + call __lshift_mod_384 + + movq $48,%rdx + addq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _mul_by_8_mod_384x +.private_extern _mul_by_8_mod_384x + +.p2align 5 +_mul_by_8_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq (%rsp),%rsi + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,48+0(%rdi) + movq %r9,48+8(%rdi) + movq %r10,48+16(%rdi) + movq %r11,48+24(%rdi) + movq %r12,48+32(%rdi) + movq %r13,48+40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _cneg_mod_384 +.private_extern _cneg_mod_384 + +.p2align 5 +_cneg_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdx +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rdx,%r8 + movq 24(%rsi),%r11 + orq %r9,%rdx + movq 32(%rsi),%r12 + orq %r10,%rdx + movq 40(%rsi),%r13 + orq %r11,%rdx + movq $-1,%rsi + orq %r12,%rdx + orq %r13,%rdx + + movq 0(%rcx),%r14 + cmovnzq %rsi,%rdx + movq 8(%rcx),%r15 + movq 16(%rcx),%rax + andq %rdx,%r14 + movq 24(%rcx),%rbx + andq %rdx,%r15 + movq 32(%rcx),%rbp + andq %rdx,%rax + movq 40(%rcx),%rsi + andq %rdx,%rbx + movq 0(%rsp),%rcx + andq %rdx,%rbp + andq %rdx,%rsi + + subq %r8,%r14 + sbbq %r9,%r15 + sbbq %r10,%rax + sbbq %r11,%rbx + sbbq %r12,%rbp + sbbq %r13,%rsi + + orq %rcx,%rcx + + cmovzq %r8,%r14 + cmovzq %r9,%r15 + cmovzq %r10,%rax + movq %r14,0(%rdi) + cmovzq %r11,%rbx + movq %r15,8(%rdi) + cmovzq 
%r12,%rbp + movq %rax,16(%rdi) + cmovzq %r13,%rsi + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rsi,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _sub_mod_384 +.private_extern _sub_mod_384 + +.p2align 5 +_sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sub_mod_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sub_mod_384x +.private_extern _sub_mod_384x + +.p2align 5 +_sub_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 24 + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __sub_mod_384 + + movq 24+0(%rsp),%r15 +.cfi_restore %r15 + movq 24+8(%rsp),%r14 +.cfi_restore %r14 + movq 24+16(%rsp),%r13 +.cfi_restore %r13 + movq 24+24(%rsp),%r12 +.cfi_restore %r12 + movq 24+32(%rsp),%rbx +.cfi_restore %rbx + movq 24+40(%rsp),%rbp +.cfi_restore %rbp + leaq 24+48(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mul_by_1_plus_i_mod_384x +.private_extern _mul_by_1_plus_i_mod_384x + +.p2align 5 +_mul_by_1_plus_i_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + 
pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $56,%rsp +.cfi_adjust_cfa_offset 56 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rbx + adcq 72(%rsi),%r11 + movq %r12,%rcx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + movq %rdi,48(%rsp) + sbbq %rdi,%rdi + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rbx + sbbq 80(%rsi),%rcx + sbbq 88(%rsi),%rbp + sbbq %rsi,%rsi + + movq %r8,0(%rsp) + movq 0(%rdx),%r8 + movq %r9,8(%rsp) + movq 8(%rdx),%r9 + movq %r10,16(%rsp) + movq 16(%rdx),%r10 + movq %r11,24(%rsp) + movq 24(%rdx),%r11 + movq %r12,32(%rsp) + andq %rsi,%r8 + movq 32(%rdx),%r12 + movq %r13,40(%rsp) + andq %rsi,%r9 + movq 40(%rdx),%r13 + andq %rsi,%r10 + andq %rsi,%r11 + andq %rsi,%r12 + andq %rsi,%r13 + movq 48(%rsp),%rsi + + addq %r8,%r14 + movq 0(%rsp),%r8 + adcq %r9,%r15 + movq 8(%rsp),%r9 + adcq %r10,%rax + movq 16(%rsp),%r10 + adcq %r11,%rbx + movq 24(%rsp),%r11 + adcq %r12,%rcx + movq 32(%rsp),%r12 + adcq %r13,%rbp + movq 40(%rsp),%r13 + + movq %r14,0(%rsi) + movq %r8,%r14 + movq %r15,8(%rsi) + movq %rax,16(%rsi) + movq %r9,%r15 + movq %rbx,24(%rsi) + movq %rcx,32(%rsi) + movq %r10,%rax + movq %rbp,40(%rsi) + + subq 0(%rdx),%r8 + movq %r11,%rbx + sbbq 8(%rdx),%r9 + sbbq 16(%rdx),%r10 + movq %r12,%rcx + sbbq 24(%rdx),%r11 + sbbq 32(%rdx),%r12 + movq %r13,%rbp + sbbq 40(%rdx),%r13 + sbbq $0,%rdi + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,48(%rsi) + cmovcq %rbx,%r11 + movq %r9,56(%rsi) + cmovcq %rcx,%r12 + movq %r10,64(%rsi) + cmovcq %rbp,%r13 + movq %r11,72(%rsi) + movq %r12,80(%rsi) + movq %r13,88(%rsi) + + movq 56+0(%rsp),%r15 +.cfi_restore %r15 + movq 56+8(%rsp),%r14 +.cfi_restore %r14 + movq 56+16(%rsp),%r13 +.cfi_restore %r13 + movq 56+24(%rsp),%r12 +.cfi_restore %r12 + movq 56+32(%rsp),%rbx +.cfi_restore %rbx + movq 56+40(%rsp),%rbp +.cfi_restore %rbp + leaq 56+48(%rsp),%rsp +.cfi_adjust_cfa_offset -56-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sgn0_pty_mod_384 +.private_extern _sgn0_pty_mod_384 + +.p2align 5 +_sgn0_pty_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + movq 40(%rdi),%rdx + + xorq %rax,%rax + movq %r8,%rdi + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + notq %rax + andq $1,%rdi + andq $2,%rax + orq %rdi,%rax + + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0_pty_mod_384x +.private_extern _sgn0_pty_mod_384x + +.p2align 5 +_sgn0_pty_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 48(%rdi),%r8 + movq 56(%rdi),%r9 + movq 64(%rdi),%r10 + movq 72(%rdi),%r11 + movq 80(%rdi),%rcx + movq 88(%rdi),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + 
leaq 0(%rdi),%rax + xorq %rdi,%rdi + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rdi + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rdi + + movq %r8,0(%rsp) + notq %rdi + andq $1,%rbp + andq $2,%rdi + orq %rbp,%rdi + + movq 0(%rax),%r8 + movq 8(%rax),%r9 + movq 16(%rax),%r10 + movq 24(%rax),%r11 + movq 32(%rax),%rcx + movq 40(%rax),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + xorq %rax,%rax + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + movq 0(%rsp),%rbx + + notq %rax + + testq %r8,%r8 + cmovzq %rdi,%rbp + + testq %rbx,%rbx + cmovnzq %rdi,%rax + + andq $1,%rbp + andq $2,%rax + orq %rbp,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_select_32 +.private_extern _vec_select_32 + +.p2align 5 +_vec_select_32: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 16(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 16(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 16(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-16(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-16(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-16(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,16-16(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_select_48 +.private_extern _vec_select_48 + +.p2align 5 +_vec_select_48: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 24(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 24(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 24(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-24(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-24(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-24(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-24(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-24(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-24(%rdi) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,32-24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_select_96 +.private_extern _vec_select_96 + +.p2align 5 +_vec_select_96: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 48(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 48(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 48(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-48(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-48(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-48(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-48(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-48(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-48(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-48(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-48(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 
64+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-48(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,80-48(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_select_192 +.private_extern _vec_select_192 + +.p2align 5 +_vec_select_192: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 96(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 96(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 96(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-96(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-96(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-96(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-96(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-96(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-96(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-96(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-96(%rdi) + pand %xmm4,%xmm0 + movdqu 128+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-96(%rdi) + pand %xmm4,%xmm2 + movdqu 144+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-96(%rdi) + pand %xmm4,%xmm0 + movdqu 160+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-96(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,176-96(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_select_144 +.private_extern _vec_select_144 + +.p2align 5 +_vec_select_144: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 72(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 72(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 72(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-72(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-72(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-72(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-72(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-72(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-72(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + 
movdqu %xmm0,96-72(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-72(%rdi) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,128-72(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_select_288 +.private_extern _vec_select_288 + +.p2align 5 +_vec_select_288: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 144(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 144(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 144(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-144(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-144(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-144(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-144(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-144(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-144(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-144(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-144(%rdi) + pand %xmm4,%xmm0 + movdqu 128+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-144(%rdi) + pand %xmm4,%xmm2 + movdqu 144+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-144(%rdi) + pand %xmm4,%xmm0 + movdqu 160+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-144(%rdi) + pand %xmm4,%xmm2 + movdqu 176+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 176+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,176-144(%rdi) + pand %xmm4,%xmm0 + movdqu 192+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 192+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,192-144(%rdi) + pand %xmm4,%xmm2 + movdqu 208+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 208+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,208-144(%rdi) + pand %xmm4,%xmm0 + movdqu 224+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 224+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,224-144(%rdi) + pand %xmm4,%xmm2 + movdqu 240+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 240+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,240-144(%rdi) + pand %xmm4,%xmm0 + movdqu 256+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 256+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,256-144(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,272-144(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_prefetch +.private_extern _vec_prefetch + +.p2align 5 +_vec_prefetch: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + leaq -1(%rdi,%rsi,1),%rsi + movq $64,%rax + xorq %r8,%r8 + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq 
%r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + prefetchnta (%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_is_zero_16x +.private_extern _vec_is_zero_16x + +.p2align 5 +_vec_is_zero_16x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + shrl $4,%esi + movdqu (%rdi),%xmm0 + leaq 16(%rdi),%rdi + +L$oop_is_zero: + decl %esi + jz L$oop_is_zero_done + movdqu (%rdi),%xmm1 + leaq 16(%rdi),%rdi + por %xmm1,%xmm0 + jmp L$oop_is_zero + +L$oop_is_zero_done: + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 +.byte 102,72,15,126,192 + incl %esi + testq %rax,%rax + cmovnzl %esi,%eax + xorl $1,%eax + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_is_equal_16x +.private_extern _vec_is_equal_16x + +.p2align 5 +_vec_is_equal_16x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + shrl $4,%edx + movdqu (%rdi),%xmm0 + movdqu (%rsi),%xmm1 + subq %rdi,%rsi + leaq 16(%rdi),%rdi + pxor %xmm1,%xmm0 + +L$oop_is_equal: + decl %edx + jz L$oop_is_equal_done + movdqu (%rdi),%xmm1 + movdqu (%rdi,%rsi,1),%xmm2 + leaq 16(%rdi),%rdi + pxor %xmm2,%xmm1 + por %xmm1,%xmm0 + jmp L$oop_is_equal + +L$oop_is_equal_done: + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 +.byte 102,72,15,126,192 + incl %edx + testq %rax,%rax + cmovnzl %edx,%eax + xorl $1,%eax + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/add_mod_384x384-x86_64.s b/crypto/blst_src/build/mach-o/add_mod_384x384-x86_64.s new file mode 100644 index 00000000000..2dc58f81608 --- /dev/null +++ b/crypto/blst_src/build/mach-o/add_mod_384x384-x86_64.s @@ -0,0 +1,244 @@ +.text + + +.p2align 5 +__add_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + addq 0(%rdx),%r8 + movq 56(%rsi),%r15 + adcq 8(%rdx),%r9 + movq 64(%rsi),%rax + adcq 16(%rdx),%r10 + movq 72(%rsi),%rbx + adcq 24(%rdx),%r11 + movq 80(%rsi),%rbp + adcq 32(%rdx),%r12 + movq 88(%rsi),%rsi + adcq 40(%rdx),%r13 + movq %r8,0(%rdi) + adcq 48(%rdx),%r14 + movq %r9,8(%rdi) + adcq 56(%rdx),%r15 + movq %r10,16(%rdi) + adcq 64(%rdx),%rax + movq %r12,32(%rdi) + movq %r14,%r8 + adcq 72(%rdx),%rbx + movq %r11,24(%rdi) + movq %r15,%r9 + adcq 80(%rdx),%rbp + movq %r13,40(%rdi) + movq %rax,%r10 + adcq 88(%rdx),%rsi + movq %rbx,%r11 + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %rbp,%r12 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%rbx + sbbq 32(%rcx),%rbp + movq %rsi,%r13 + sbbq 40(%rcx),%rsi + sbbq $0,%rdx + + cmovcq %r8,%r14 + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %r14,48(%rdi) + cmovcq %r11,%rbx + movq %r15,56(%rdi) + cmovcq %r12,%rbp + movq %rax,64(%rdi) + cmovcq %r13,%rsi + movq %rbx,72(%rdi) + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 
72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _add_mod_384x384 +.private_extern _add_mod_384x384 + +.p2align 5 +_add_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __add_mod_384x384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sub_mod_384x384 +.private_extern _sub_mod_384x384 + +.p2align 5 +_sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sub_mod_384x384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/ct_inverse_mod_256-armv8.S b/crypto/blst_src/build/mach-o/ct_inverse_mod_256-armv8.S new file mode 100644 index 00000000000..f3a2c3b5f11 --- /dev/null +++ b/crypto/blst_src/build/mach-o/ct_inverse_mod_256-armv8.S @@ -0,0 +1,784 @@ +.text + +.globl _ct_inverse_mod_256 + +.align 5 +_ct_inverse_mod_256: +.long 3573752639 + stp x29, x30, [sp,#-80]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + sub sp, sp, #1040 + + ldp x4, x5, [x1,#8*0] + ldp x6, x7, [x1,#8*2] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ str x0, [sp] + + ldp x8, x9, [x2,#8*0] + ldp x10, x11, [x2,#8*2] + + stp x4, x5, [x1,#8*0] // copy input to |a| + stp x6, x7, [x1,#8*2] + stp x8, x9, [x1,#8*4] // copy modulus to |b| + stp x10, x11, [x1,#8*6] + + ////////////////////////////////////////// first iteration + bl Lab_approximation_31_256_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + str x12,[x0,#8*8] // initialize |u| with |f0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to dst |b| + bl __smul_256_n_shift_by_31 + str x12, [x0,#8*9] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + ldr x8, [x1,#8*8] // |u| + ldr x9, [x1,#8*13] // |v| + madd x4, x16, x8, xzr // |u|*|f0| + madd x4, x17, x9, x4 // |v|*|g0| + str x4, [x0,#8*4] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*5] + stp x5, x5, [x0,#8*7] + + madd x4, x12, x8, xzr // |u|*|f1| + madd x4, x13, x9, x4 // |v|*|g1| + str x4, [x0,#8*9] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*10] + stp x5, x5, [x0,#8*12] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst 
|a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc 
x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + ////////////////////////////////////////// two[!] 
last iterations + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #47 // 31 + 512 % 31 + //bl __ab_approximation_62_256 // |a| and |b| are exact, + ldr x7, [x1,#8*0] // just load + ldr x11, [x1,#8*4] + bl __inner_loop_62_256 + + mov x16, x14 + mov x17, x15 + ldr x0, [sp] // original out_ptr + bl __smul_256x63 + bl __smul_512x63_tail + ldr x30, [x29,#8] + + smulh x20, x7, x17 // figure out top-most limb + ldp x8, x9, [x3,#8*0] + adc x23, x23, x25 + ldp x10, x11, [x3,#8*2] + + add x20, x20, x23 // x20 is 1, 0 or -1 + asr x19, x20, #63 // sign as mask + + and x23, x8, x19 // add mod<<256 conditionally + and x24, x9, x19 + adds x4, x4, x23 + and x25, x10, x19 + adcs x5, x5, x24 + and x26, x11, x19 + adcs x6, x6, x25 + adcs x7, x22, x26 + adc x20, x20, xzr // x20 is 1, 0 or -1 + + neg x19, x20 + orr x20, x20, x19 // excess bit or sign as mask + asr x19, x19, #63 // excess bit as mask + + and x8, x8, x20 // mask |mod| + and x9, x9, x20 + and x10, x10, x20 + and x11, x11, x20 + + eor x8, x8, x19 // conditionally negate |mod| + eor x9, x9, x19 + adds x8, x8, x19, lsr#63 + eor x10, x10, x19 + adcs x9, x9, xzr + eor x11, x11, x19 + adcs x10, x10, xzr + adc x11, x11, xzr + + adds x4, x4, x8 // final adjustment for |mod|<<256 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*4] + adc x7, x7, x11 + stp x6, x7, [x0,#8*6] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldr x29, [sp],#80 +.long 3573752767 + ret + + +//////////////////////////////////////////////////////////////////////// + +.align 5 +__smul_256x63: + ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|) + asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x6, x7, [x1,#8*2+64] + eor x16, x16, x14 // conditionally negate |f_| (or |g_|) + ldr x22, [x1,#8*4+64] + + eor x4, x4, x14 // conditionally negate |u| (or |v|) + sub x16, x16, x14 + eor x5, x5, x14 + adds x4, x4, x14, lsr#63 + eor x6, x6, x14 + adcs x5, x5, xzr + eor x7, x7, x14 + adcs x6, x6, xzr + eor x22, x22, x14 + umulh x19, x4, x16 + adcs x7, x7, xzr + umulh x20, x5, x16 + adcs x22, x22, xzr + umulh x21, x6, x16 + mul x4, x4, x16 + cmp x16, #0 + mul x5, x5, x16 + csel x22, x22, xzr, ne + mul x6, x6, x16 + adds x5, x5, x19 + mul x24, x7, x16 + adcs x6, x6, x20 + adcs x24, x24, x21 + adc x26, xzr, xzr + ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|) + asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x10, x11, [x1,#8*2+104] + eor x17, x17, x14 // conditionally negate |f_| (or |g_|) + ldr x23, [x1,#8*4+104] + + eor x8, x8, x14 // conditionally negate |u| (or |v|) + sub x17, x17, x14 + eor x9, x9, x14 + adds x8, x8, x14, lsr#63 + eor x10, x10, x14 + adcs x9, x9, xzr + eor x11, x11, x14 + adcs x10, x10, xzr + eor x23, x23, x14 + umulh x19, x8, x17 + adcs x11, x11, xzr + umulh x20, x9, x17 + adcs x23, x23, xzr + umulh x21, x10, x17 + adc x15, xzr, xzr // used in __smul_512x63_tail + mul x8, x8, x17 + cmp x17, #0 + mul x9, x9, x17 + csel x23, x23, xzr, ne + mul x10, x10, x17 + adds x9, x9, x19 + mul x25, x11, x17 + adcs x10, x10, x20 + adcs x25, x25, x21 + adc x26, x26, xzr + + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*0] + adcs x24, x24, x25 + stp x6, x24, [x0,#8*2] + + ret + + + +.align 5 +__smul_512x63_tail: + umulh x24, x7, x16 + ldp x5, x6, [x1,#8*18] // load rest of |v| + adc x26, x26, xzr + ldr x7, [x1,#8*20] + and x22, x22, x16 + + umulh x11, x11, x17 // resume |v|*|g1| chain + + sub x24, x24, x22 // tie up |u|*|f1| chain + asr x25, x24, #63 + + 
eor x5, x5, x14 // conditionally negate rest of |v| + eor x6, x6, x14 + adds x5, x5, x15 + eor x7, x7, x14 + adcs x6, x6, xzr + umulh x19, x23, x17 + adc x7, x7, xzr + umulh x20, x5, x17 + add x11, x11, x26 + umulh x21, x6, x17 + + mul x4, x23, x17 + mul x5, x5, x17 + adds x4, x4, x11 + mul x6, x6, x17 + adcs x5, x5, x19 + mul x22, x7, x17 + adcs x6, x6, x20 + adcs x22, x22, x21 + adc x23, xzr, xzr // used in the final step + + adds x4, x4, x24 + adcs x5, x5, x25 + adcs x6, x6, x25 + stp x4, x5, [x0,#8*4] + adcs x22, x22, x25 // carry is used in the final step + stp x6, x22, [x0,#8*6] + + ret + + + +.align 5 +__smul_256_n_shift_by_31: + ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) + asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x6, x7, [x1,#8*2+0] + eor x25, x12, x24 // conditionally negate |f0| (or |g0|) + + eor x4, x4, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x5, x5, x24 + adds x4, x4, x24, lsr#63 + eor x6, x6, x24 + adcs x5, x5, xzr + eor x7, x7, x24 + umulh x19, x4, x25 + adcs x6, x6, xzr + umulh x20, x5, x25 + adc x7, x7, xzr + umulh x21, x6, x25 + and x24, x24, x25 + umulh x22, x7, x25 + neg x24, x24 + + mul x4, x4, x25 + mul x5, x5, x25 + mul x6, x6, x25 + adds x5, x5, x19 + mul x7, x7, x25 + adcs x6, x6, x20 + adcs x7, x7, x21 + adc x22, x22, x24 + ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) + asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x10, x11, [x1,#8*2+32] + eor x25, x13, x24 // conditionally negate |f0| (or |g0|) + + eor x8, x8, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x9, x9, x24 + adds x8, x8, x24, lsr#63 + eor x10, x10, x24 + adcs x9, x9, xzr + eor x11, x11, x24 + umulh x19, x8, x25 + adcs x10, x10, xzr + umulh x20, x9, x25 + adc x11, x11, xzr + umulh x21, x10, x25 + and x24, x24, x25 + umulh x23, x11, x25 + neg x24, x24 + + mul x8, x8, x25 + mul x9, x9, x25 + mul x10, x10, x25 + adds x9, x9, x19 + mul x11, x11, x25 + adcs x10, x10, x20 + adcs x11, x11, x21 + adc x23, x23, x24 + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x8, x22, x23 + + extr x4, x5, x4, #31 + extr x5, x6, x5, #31 + extr x6, x7, x6, #31 + asr x23, x8, #63 // result's sign as mask + extr x7, x8, x7, #31 + + eor x4, x4, x23 // ensure the result is positive + eor x5, x5, x23 + adds x4, x4, x23, lsr#63 + eor x6, x6, x23 + adcs x5, x5, xzr + eor x7, x7, x23 + adcs x6, x6, xzr + stp x4, x5, [x0,#8*0] + adc x7, x7, xzr + stp x6, x7, [x0,#8*2] + + eor x12, x12, x23 // adjust |f/g| accordingly + eor x13, x13, x23 + sub x12, x12, x23 + sub x13, x13, x23 + + ret + + +.align 4 +__ab_approximation_31_256: + ldp x6, x7, [x1,#8*2] + ldp x10, x11, [x1,#8*6] + ldp x4, x5, [x1,#8*0] + ldp x8, x9, [x1,#8*4] + +Lab_approximation_31_256_loaded: + orr x19, x7, x11 // check top-most limbs, ... + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x5, ne + orr x19, x7, x11 // and ones before top-most, ... + csel x10, x10, x9, ne + + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x4, ne + orr x19, x7, x11 // and one more, ... 
+ csel x10, x10, x8, ne + + clz x19, x19 + cmp x19, #64 + csel x19, x19, xzr, ne + csel x7, x7, x6, ne + csel x11, x11, x10, ne + neg x20, x19 + + lslv x7, x7, x19 // align high limbs to the left + lslv x11, x11, x19 + lsrv x6, x6, x20 + lsrv x10, x10, x20 + and x6, x6, x20, asr#6 + and x10, x10, x20, asr#6 + orr x7, x7, x6 + orr x11, x11, x10 + + bfxil x7, x4, #0, #31 + bfxil x11, x8, #0, #31 + + b __inner_loop_31_256 + ret + + + +.align 4 +__inner_loop_31_256: + mov x2, #31 + mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x23,#0x7FFFFFFF7FFFFFFF + +Loop_31_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x15 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x15, x15, x13, hs // exchange |fg0| and |fg1| + csel x13, x13, x19, hs + lsr x7, x7, #1 + and x19, x15, x22 + and x20, x23, x22 + sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x15, x15, x15 // |f1|<<=1 + add x13, x13, x20 + sub x15, x15, x23 + cbnz x2, Loop_31_256 + + mov x23, #0x7FFFFFFF + ubfx x12, x13, #0, #32 + ubfx x13, x13, #32, #32 + ubfx x14, x15, #0, #32 + ubfx x15, x15, #32, #32 + sub x12, x12, x23 // remove bias + sub x13, x13, x23 + sub x14, x14, x23 + sub x15, x15, x23 + + ret + + + +.align 4 +__inner_loop_62_256: + mov x12, #1 // |f0|=1 + mov x13, #0 // |g0|=0 + mov x14, #0 // |f1|=0 + mov x15, #1 // |g1|=1 + +Loop_62_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x12 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + mov x20, x13 + csel x12, x12, x14, hs // exchange |f0| and |f1| + csel x14, x14, x19, hs + csel x13, x13, x15, hs // exchange |g0| and |g1| + csel x15, x15, x20, hs + lsr x7, x7, #1 + and x19, x14, x22 + and x20, x15, x22 + add x14, x14, x14 // |f1|<<=1 + add x15, x15, x15 // |g1|<<=1 + sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) 
+ cbnz x2, Loop_62_256 + + ret + diff --git a/crypto/blst_src/build/mach-o/ct_inverse_mod_256-x86_64.s b/crypto/blst_src/build/mach-o/ct_inverse_mod_256-x86_64.s new file mode 100644 index 00000000000..b6441da6e1f --- /dev/null +++ b/crypto/blst_src/build/mach-o/ct_inverse_mod_256-x86_64.s @@ -0,0 +1,1177 @@ +.text + +.globl _ct_inverse_mod_256 + +.p2align 5 +_ct_inverse_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1072,%rsp +.cfi_adjust_cfa_offset 1072 + + + leaq 48+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + movq 0(%rdx),%r12 + movq 8(%rdx),%r13 + movq 16(%rdx),%r14 + movq 24(%rdx),%r15 + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + + movq %r12,32(%rax) + movq %r13,40(%rax) + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rax,%rsi + + + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,64(%rdi) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,72(%rdi) + + + xorq $256,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + + movq 64(%rsi),%r8 + movq 104(%rsi),%r12 + movq %r8,%r9 + imulq 0(%rsp),%r8 + movq %r12,%r13 + imulq 8(%rsp),%r12 + addq %r12,%r8 + movq %r8,32(%rdi) + sarq $63,%r8 + movq %r8,40(%rdi) + movq %r8,48(%rdi) + movq %r8,56(%rdi) + movq %r8,64(%rdi) + leaq 64(%rsi),%rsi + + imulq %rdx,%r9 + imulq %rcx,%r13 + addq %r13,%r9 + movq %r9,72(%rdi) + sarq $63,%r9 + movq %r9,80(%rdi) + movq %r9,88(%rdi) + movq %r9,96(%rdi) + movq %r9,104(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + 
movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + sarq $63,%rbp + movq %rbp,40(%rdi) + movq %rbp,48(%rdi) + movq %rbp,56(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + 
movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + + xorq $256+64,%rsi + movl $47,%edx + + movq 0(%rsi),%r8 + + movq 32(%rsi),%r10 + + call __inner_loop_62_256 + + + + + + + + leaq 64(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_512x63 + adcq %rbp,%rdx + + movq 40(%rsp),%rsi + movq %rdx,%rax + sarq $63,%rdx + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %rdx,%r15 + adcq $0,%rax + + movq %rax,%rdx + negq %rax + orq %rax,%rdx + sarq $63,%rax + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + xorq %rax,%r8 + xorq %rcx,%rcx + xorq %rax,%r9 + subq %rax,%rcx + xorq %rax,%r10 + xorq %rax,%rdx + addq %rcx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %rdx,%r15 + + movq %r12,32(%rdi) + movq 
%r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 1072(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1072-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulq_512x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %r9,8(%rdi) + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %r10,16(%rdi) + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %r11,24(%rdi) + + movq 40(%rsi),%r8 + movq 48(%rsi),%r9 + movq 56(%rsi),%r10 + movq 64(%rsi),%r11 + movq 72(%rsi),%r12 + movq 80(%rsi),%r13 + movq 88(%rsi),%r14 + movq 96(%rsi),%r15 + + movq %rcx,%rdx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rcx + addq %rax,%rcx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rcx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rcx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rcx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rcx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rcx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rcx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rcx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + imulq %rcx + addq %rax,%r15 + adcq $0,%rdx + + movq %rbp,%rbx + sarq $63,%rbp + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq %rbx,%r12 + adcq %rbp,%r13 + adcq %rbp,%r14 + adcq %rbp,%r15 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__smulq_256x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %rcx,%rdx + movq 40+0(%rsi),%r12 + movq 40+8(%rsi),%r13 + movq 40+16(%rsi),%r14 + movq 40+24(%rsi),%r15 + 
movq 40+32(%rsi),%rcx + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rcx + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rcx + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + andq %rbx,%rcx + negq %rcx + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %rbp,32(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulq_256_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,0(%rdi) + movq %rcx,8(%rdi) + movq %rdx,%rbp + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + + movq %rbp,%rbx + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rbx + addq %rax,%rbx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + andq %rbx,%rbp + negq %rbp + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r14 + movq 32+24(%rsi),%r15 + + movq %rcx,%rbx + sarq $63,%rcx + xorq %rax,%rax + subq %rcx,%rax + + xorq %rcx,%rbx + addq %rax,%rbx + + xorq %rcx,%r12 + xorq %rcx,%r13 + xorq %rcx,%r14 + xorq %rcx,%r15 + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + andq %rbx,%rcx + negq %rcx + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq 0(%rdi),%rdx + movq 8(%rdi),%rcx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%rbp,%r11 + + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + xorq %rbp,%rdx + xorq %rbp,%rcx + addq %rax,%rdx + addq %rax,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__ab_approximation_31_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 24(%rsi),%r9 + movq 56(%rsi),%r11 + movq 16(%rsi),%rbx + movq 48(%rsi),%rbp + movq 8(%rsi),%r8 + movq 40(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 32(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + notq %rax + andq %rax,%r9 + andq %rax,%r11 + orq %r9,%r8 + orq 
%r11,%r10 + + jmp __inner_loop_31_256 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__inner_loop_31_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +L$oop_31_256: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edx + jnz L$oop_31_256 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__inner_loop_62_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movl %edx,%r15d + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq %rdx,%r13 + movq %rdx,%r14 + +L$oop_62_256: + xorq %rax,%rax + testq %r14,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq %r14,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%r15d + jnz L$oop_62_256 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/ct_inverse_mod_384-armv8.S b/crypto/blst_src/build/mach-o/ct_inverse_mod_384-armv8.S new file mode 100644 index 00000000000..c7d9ba8488e --- /dev/null +++ b/crypto/blst_src/build/mach-o/ct_inverse_mod_384-armv8.S @@ -0,0 +1,717 @@ +.text + +.globl _ct_inverse_mod_383 + +.align 5 +_ct_inverse_mod_383: +.long 3573752639 + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #1040 + + ldp x22, x4, [x1,#8*0] + ldp x5, x6, [x1,#8*2] + ldp x7, x8, [x1,#8*4] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ stp x0, x3, [sp] + + ldp x9, x10, [x2,#8*0] + ldp x11, x12, [x2,#8*2] + ldp x13, x14, [x2,#8*4] + + stp x22, x4, [x1,#8*0] // copy input to |a| + stp x5, x6, [x1,#8*2] + stp x7, x8, [x1,#8*4] + stp x9, x10, [x1,#8*6] // copy modulus to |b| + stp x11, x12, [x1,#8*8] + stp x13, x14, [x1,#8*10] + + ////////////////////////////////////////// first iteration + mov x2, #62 + bl Lab_approximation_62_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + str x15,[x0,#8*12] // initialize |u| with |f0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to dst |b| + bl __smul_383_n_shift_by_62 + str x15, [x0,#8*12] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + ldr x7, [x1,#8*12] // |u| + ldr x8, [x1,#8*18] // |v| + mul x3, x20, x7 // |u|*|f0| + smulh x4, x20, x7 + mul x5, x21, x8 // |v|*|g0| + smulh x6, x21, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*6] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*8] + stp x5, x5, [x0,#8*10] + + mul x3, x15, x7 // |u|*|f1| + smulh x4, x15, x7 + mul x5, x16, x8 // |v|*|g1| + smulh x6, x16, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*12] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*14] + stp x5, x5, [x0,#8*16] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 
// corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + asr x27, x27, #63 // sign extension + stp x27, x27, [x0,#8*6] + stp x27, x27, [x0,#8*8] + stp x27, x27, [x0,#8*10] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + ////////////////////////////////////////// iteration before last + eor x1, x1, #256 // flip-flop src 
|a|b|u|v| + mov x2, #62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldp x3, x8, [x1,#8*0] // just load + ldp x9, x14, [x1,#8*6] + bl __inner_loop_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + str x3, [x0,#8*0] + str x9, [x0,#8*6] + + mov x20, x15 // exact |f0| + mov x21, x16 // exact |g0| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*12 // pointer to dst |u| + bl __smul_383x63 + + mov x20, x15 // exact |f1| + mov x21, x16 // exact |g1| + add x0, x0, #8*6 // pointer to dst |v| + bl __smul_383x63 + bl __smul_767x63_tail + + ////////////////////////////////////////// last iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #22 // 766 % 62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldr x3, [x1,#8*0] // just load + eor x8, x8, x8 + ldr x9, [x1,#8*6] + eor x14, x14, x14 + bl __inner_loop_62 + + mov x20, x17 + mov x21, x19 + ldp x0, x15, [sp] // original out_ptr and n_ptr + bl __smul_383x63 + bl __smul_767x63_tail + ldr x30, [x29,#8] + + asr x22, x8, #63 // sign as mask + ldp x9, x10, [x15,#8*0] + ldp x11, x12, [x15,#8*2] + ldp x13, x14, [x15,#8*4] + + and x9, x9, x22 // add mod<<384 conditionally + and x10, x10, x22 + adds x3, x3, x9 + and x11, x11, x22 + adcs x4, x4, x10 + and x12, x12, x22 + adcs x5, x5, x11 + and x13, x13, x22 + adcs x6, x6, x12 + and x14, x14, x22 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*8] + adc x8, x8, x14 + stp x7, x8, [x0,#8*10] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 +.long 3573752767 + ret + + +//////////////////////////////////////////////////////////////////////// +// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
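+//
+// Helper subroutines, in order of appearance:
+// __smul_383x63            - low 384 bits of |u|*f + |v|*g for signed
+//                            63-bit factors f and g;
+// __smul_767x63_tail       - completes the upper half of the 767-bit
+//                            |u|*f + |v|*g accumulation;
+// __smul_383_n_shift_by_62 - (|a|*f0 + |b|*g0) >> 62, negated if needed,
+//                            with the factors' signs adjusted accordingly;
+// __ab_approximation_62    - condenses |a| and |b| to two-limb
+//                            approximations (exact bottom limb plus the
+//                            top non-zero bits);
+// __inner_loop_62          - x2 branchless iterations producing the
+//                            transition factors |f0|, |g0|, |f1|, |g1|.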
+ +.align 5 +__smul_383x63: + ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) + asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x5, x6, [x1,#8*2+96] + eor x20, x20, x17 // conditionally negate |f_| (or |g_|) + ldp x7, x8, [x1,#8*4+96] + + eor x3, x3, x17 // conditionally negate |u| (or |v|) + sub x20, x20, x17 + eor x4, x4, x17 + adds x3, x3, x17, lsr#63 + eor x5, x5, x17 + adcs x4, x4, xzr + eor x6, x6, x17 + adcs x5, x5, xzr + eor x7, x7, x17 + adcs x6, x6, xzr + umulh x22, x3, x20 + eor x8, x8, x17 + umulh x23, x4, x20 + adcs x7, x7, xzr + umulh x24, x5, x20 + adcs x8, x8, xzr + umulh x25, x6, x20 + umulh x26, x7, x20 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x22 + mul x6, x6, x20 + adcs x5, x5, x23 + mul x7, x7, x20 + adcs x6, x6, x24 + mul x27,x8, x20 + adcs x7, x7, x25 + adcs x27,x27,x26 + adc x2, xzr, xzr + ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|) + asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x11, x12, [x1,#8*2+144] + eor x21, x21, x17 // conditionally negate |f_| (or |g_|) + ldp x13, x14, [x1,#8*4+144] + + eor x9, x9, x17 // conditionally negate |u| (or |v|) + sub x21, x21, x17 + eor x10, x10, x17 + adds x9, x9, x17, lsr#63 + eor x11, x11, x17 + adcs x10, x10, xzr + eor x12, x12, x17 + adcs x11, x11, xzr + eor x13, x13, x17 + adcs x12, x12, xzr + umulh x22, x9, x21 + eor x14, x14, x17 + umulh x23, x10, x21 + adcs x13, x13, xzr + umulh x24, x11, x21 + adcs x14, x14, xzr + umulh x25, x12, x21 + adc x19, xzr, xzr // used in __smul_767x63_tail + umulh x26, x13, x21 + mul x9, x9, x21 + mul x10, x10, x21 + mul x11, x11, x21 + adds x10, x10, x22 + mul x12, x12, x21 + adcs x11, x11, x23 + mul x13, x13, x21 + adcs x12, x12, x24 + mul x28,x14, x21 + adcs x13, x13, x25 + adcs x28,x28,x26 + adc x2, x2, xzr + + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + stp x3, x4, [x0,#8*0] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*2] + adcs x27, x27, x28 + stp x7, x27, [x0,#8*4] + adc x28, x2, xzr // used in __smul_767x63_tail + + ret + + + +.align 5 +__smul_767x63_tail: + smulh x27, x8, x20 + ldp x3, x4, [x1,#8*24] // load rest of |v| + umulh x14,x14, x21 + ldp x5, x6, [x1,#8*26] + ldp x7, x8, [x1,#8*28] + + eor x3, x3, x17 // conditionally negate rest of |v| + eor x4, x4, x17 + eor x5, x5, x17 + adds x3, x3, x19 + eor x6, x6, x17 + adcs x4, x4, xzr + eor x7, x7, x17 + adcs x5, x5, xzr + eor x8, x8, x17 + adcs x6, x6, xzr + umulh x22, x3, x21 + adcs x7, x7, xzr + umulh x23, x4, x21 + adc x8, x8, xzr + + umulh x24, x5, x21 + add x14, x14, x28 + umulh x25, x6, x21 + asr x28, x27, #63 + umulh x26, x7, x21 + mul x3, x3, x21 + mul x4, x4, x21 + mul x5, x5, x21 + adds x3, x3, x14 + mul x6, x6, x21 + adcs x4, x4, x22 + mul x7, x7, x21 + adcs x5, x5, x23 + mul x8, x8, x21 + adcs x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, x26 + + adds x3, x3, x27 + adcs x4, x4, x28 + adcs x5, x5, x28 + adcs x6, x6, x28 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x28 + stp x5, x6, [x0,#8*8] + adc x8, x8, x28 + stp x7, x8, [x0,#8*10] + + ret + + + +.align 5 +__smul_383_n_shift_by_62: + ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) + asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x5, x6, [x1,#8*2+0] + eor x2, x15, x28 // conditionally negate |f0| (or |g0|) + ldp x7, x8, [x1,#8*4+0] + + eor x3, x3, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + umulh x22, x3, x2 + adcs x6, x6, xzr + umulh x23, 
x4, x2 + eor x8, x8, x28 + umulh x24, x5, x2 + adcs x7, x7, xzr + umulh x25, x6, x2 + adc x8, x8, xzr + + umulh x26, x7, x2 + smulh x27, x8, x2 + mul x3, x3, x2 + mul x4, x4, x2 + mul x5, x5, x2 + adds x4, x4, x22 + mul x6, x6, x2 + adcs x5, x5, x23 + mul x7, x7, x2 + adcs x6, x6, x24 + mul x8, x8, x2 + adcs x7, x7, x25 + adcs x8, x8 ,x26 + adc x27, x27, xzr + ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) + asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x11, x12, [x1,#8*2+48] + eor x2, x16, x28 // conditionally negate |f0| (or |g0|) + ldp x13, x14, [x1,#8*4+48] + + eor x9, x9, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x10, x10, x28 + adds x9, x9, x28, lsr#63 + eor x11, x11, x28 + adcs x10, x10, xzr + eor x12, x12, x28 + adcs x11, x11, xzr + eor x13, x13, x28 + umulh x22, x9, x2 + adcs x12, x12, xzr + umulh x23, x10, x2 + eor x14, x14, x28 + umulh x24, x11, x2 + adcs x13, x13, xzr + umulh x25, x12, x2 + adc x14, x14, xzr + + umulh x26, x13, x2 + smulh x28, x14, x2 + mul x9, x9, x2 + mul x10, x10, x2 + mul x11, x11, x2 + adds x10, x10, x22 + mul x12, x12, x2 + adcs x11, x11, x23 + mul x13, x13, x2 + adcs x12, x12, x24 + mul x14, x14, x2 + adcs x13, x13, x25 + adcs x14, x14 ,x26 + adc x28, x28, xzr + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x27, x28 + + extr x3, x4, x3, #62 + extr x4, x5, x4, #62 + extr x5, x6, x5, #62 + asr x28, x9, #63 + extr x6, x7, x6, #62 + extr x7, x8, x7, #62 + extr x8, x9, x8, #62 + + eor x3, x3, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + adcs x6, x6, xzr + eor x8, x8, x28 + stp x3, x4, [x0,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x0,#8*2] + adc x8, x8, xzr + stp x7, x8, [x0,#8*4] + + eor x15, x15, x28 + eor x16, x16, x28 + sub x15, x15, x28 + sub x16, x16, x28 + + ret + + +.align 4 +__ab_approximation_62: + ldp x7, x8, [x1,#8*4] + ldp x13, x14, [x1,#8*10] + ldp x5, x6, [x1,#8*2] + ldp x11, x12, [x1,#8*8] + +Lab_approximation_62_loaded: + orr x22, x8, x14 // check top-most limbs, ... + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x22, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + ldp x3, x4, [x1,#8*0] + ldp x9, x10, [x1,#8*6] + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x22, x8, x14 // ... and ones before that ... 
+ csel x13, x13, x11, ne + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x22, x8, x14 + csel x13, x13, x10, ne + + clz x22, x22 + cmp x22, #64 + csel x22, x22, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x23, x22 + + lslv x8, x8, x22 // align high limbs to the left + lslv x14, x14, x22 + lsrv x7, x7, x23 + lsrv x13, x13, x23 + and x7, x7, x23, asr#6 + and x13, x13, x23, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + b __inner_loop_62 + ret + + +.align 4 +__inner_loop_62: + mov x15, #1 // |f0|=1 + mov x16, #0 // |g0|=0 + mov x17, #0 // |f1|=0 + mov x19, #1 // |g1|=1 + +Loop_62: + sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + subs x24, x9, x3 // |b_|-|a_| + and x22, x9, x28 + sbc x25, x14, x8 + and x23, x14, x28 + subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x22, x15 + sbcs x27, x8, x23 + mov x23, x16 + csel x9, x9, x3, hs // |b_| = |a_| + csel x14, x14, x8, hs + csel x3, x26, x24, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x8, x27, x25, hs + csel x15, x15, x17, hs // exchange |f0| and |f1| + csel x17, x17, x22, hs + csel x16, x16, x19, hs // exchange |g0| and |g1| + csel x19, x19, x23, hs + extr x3, x8, x3, #1 + lsr x8, x8, #1 + and x22, x17, x28 + and x23, x19, x28 + add x17, x17, x17 // |f1|<<=1 + add x19, x19, x19 // |g1|<<=1 + sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...) + cbnz x2, Loop_62 + + ret + diff --git a/crypto/blst_src/build/mach-o/ct_is_square_mod_384-armv8.S b/crypto/blst_src/build/mach-o/ct_is_square_mod_384-armv8.S new file mode 100644 index 00000000000..b5c953d287a --- /dev/null +++ b/crypto/blst_src/build/mach-o/ct_is_square_mod_384-armv8.S @@ -0,0 +1,324 @@ +.text + +.globl _ct_is_square_mod_384 + +.align 5 +_ct_is_square_mod_384: +.long 3573752639 + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #512 + + ldp x3, x4, [x0,#8*0] // load input + ldp x5, x6, [x0,#8*2] + ldp x7, x8, [x0,#8*4] + + add x0, sp, #255 // find closest 256-byte-aligned spot + and x0, x0, #-256 // in the frame... 
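+	// the aligned area holds working copies of the modulus |b| and the
+	// input |a|; x2 accumulates the Legendre symbol bit, so the routine
+	// returns 1 when the input is a quadratic residue mod |b|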
+ + ldp x9, x10, [x1,#8*0] // load modulus + ldp x11, x12, [x1,#8*2] + ldp x13, x14, [x1,#8*4] + + stp x3, x4, [x0,#8*6] // copy input to |a| + stp x5, x6, [x0,#8*8] + stp x7, x8, [x0,#8*10] + stp x9, x10, [x0,#8*0] // copy modulus to |b| + stp x11, x12, [x0,#8*2] + stp x13, x14, [x0,#8*4] + + eor x2, x2, x2 // init the Legendre symbol + mov x15, #24 // 24 is 768/30-1 + b Loop_is_square + +.align 4 +Loop_is_square: + bl __ab_approximation_30 + sub x15, x15, #1 + + eor x1, x0, #128 // pointer to dst |b| + bl __smul_384_n_shift_by_30 + + mov x19, x16 // |f0| + mov x20, x17 // |g0| + add x1, x1, #8*6 // pointer to dst |a| + bl __smul_384_n_shift_by_30 + + ldp x9, x10, [x1,#-8*6] + eor x0, x0, #128 // flip-flop src |a|b| + and x27, x27, x9 // if |a| was negative, + add x2, x2, x27, lsr#1 // adjust |L| + + cbnz x15, Loop_is_square + + ////////////////////////////////////////// last iteration + //bl __ab_approximation_30 // |a| and |b| are exact, + //ldr x8, [x0,#8*6] // and loaded + //ldr x14, [x0,#8*0] + mov x15, #48 // 48 is 768%30 + 30 + bl __inner_loop_48 + ldr x30, [x29,#8] + + and x0, x2, #1 + eor x0, x0, #1 + + add sp, sp, #512 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 +.long 3573752767 + ret + + + +.align 5 +__smul_384_n_shift_by_30: + ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|) + asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x5, x6, [x0,#8*2+0] + eor x20, x20, x27 // conditionally negate |g1| (or |f1|) + ldp x7, x8, [x0,#8*4+0] + + eor x3, x3, x27 // conditionally negate |b| (or |a|) + sub x20, x20, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + umulh x21, x3, x20 + adcs x6, x6, xzr + umulh x22, x4, x20 + eor x8, x8, x27 + umulh x23, x5, x20 + adcs x7, x7, xzr + umulh x24, x6, x20 + adc x8, x8, xzr + + umulh x25, x7, x20 + and x28, x20, x27 + umulh x26, x8, x20 + neg x28, x28 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x21 + mul x6, x6, x20 + adcs x5, x5, x22 + mul x7, x7, x20 + adcs x6, x6, x23 + mul x8, x8, x20 + adcs x7, x7, x24 + adcs x8, x8 ,x25 + adc x26, x26, x28 + ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|) + asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x11, x12, [x0,#8*2+48] + eor x19, x19, x27 // conditionally negate |g1| (or |f1|) + ldp x13, x14, [x0,#8*4+48] + + eor x9, x9, x27 // conditionally negate |b| (or |a|) + sub x19, x19, x27 + eor x10, x10, x27 + adds x9, x9, x27, lsr#63 + eor x11, x11, x27 + adcs x10, x10, xzr + eor x12, x12, x27 + adcs x11, x11, xzr + eor x13, x13, x27 + umulh x21, x9, x19 + adcs x12, x12, xzr + umulh x22, x10, x19 + eor x14, x14, x27 + umulh x23, x11, x19 + adcs x13, x13, xzr + umulh x24, x12, x19 + adc x14, x14, xzr + + umulh x25, x13, x19 + and x28, x19, x27 + umulh x27, x14, x19 + neg x28, x28 + mul x9, x9, x19 + mul x10, x10, x19 + mul x11, x11, x19 + adds x10, x10, x21 + mul x12, x12, x19 + adcs x11, x11, x22 + mul x13, x13, x19 + adcs x12, x12, x23 + mul x14, x14, x19 + adcs x13, x13, x24 + adcs x14, x14 ,x25 + adc x27, x27, x28 + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x26, x27 + + extr x3, x4, x3, #30 + extr x4, x5, x4, #30 + extr x5, x6, x5, #30 + asr x27, x9, #63 + extr x6, x7, x6, #30 + extr x7, x8, x7, #30 + extr x8, x9, x8, #30 + + eor x3, x3, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor 
x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + adcs x6, x6, xzr + eor x8, x8, x27 + stp x3, x4, [x1,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x1,#8*2] + adc x8, x8, xzr + stp x7, x8, [x1,#8*4] + + ret + + +.align 4 +__ab_approximation_30: + ldp x13, x14, [x0,#8*4] // |a| is still in registers + ldp x11, x12, [x0,#8*2] + + orr x21, x8, x14 // check top-most limbs, ... + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x21, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x21, x8, x14 // ... and ones before that ... + csel x13, x13, x11, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x21, x8, x14 // and one more, ... + csel x13, x13, x10, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x3, ne + orr x21, x8, x14 + csel x13, x13, x9, ne + + clz x21, x21 + cmp x21, #64 + csel x21, x21, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x22, x21 + + lslv x8, x8, x21 // align high limbs to the left + lslv x14, x14, x21 + lsrv x7, x7, x22 + lsrv x13, x13, x22 + and x7, x7, x22, asr#6 + and x13, x13, x22, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + bfxil x8, x3, #0, #32 + bfxil x14, x9, #0, #32 + + b __inner_loop_30 + ret + + + +.align 4 +__inner_loop_30: + mov x28, #30 + mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x27,#0x7FFFFFFF7FFFFFFF + +Loop_30: + sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x8, x14 + sub x28, x28, #1 + and x21, x14, x24 + + sub x22, x14, x8 // |b_|-|a_| + subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 + mov x21, x20 + csel x14, x14, x8, hs // |b_| = |a_| + csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x20, x20, x17, hs // exchange |fg0| and |fg1| + csel x17, x17, x21, hs + csel x2, x2, x25, hs + lsr x8, x8, #1 + and x21, x20, x24 + and x22, x27, x24 + add x23, x14, #2 + sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x20, x20, x20 // |f1|<<=1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + add x17, x17, x22 + sub x20, x20, x27 + + cbnz x28, Loop_30 + + mov x27, #0x7FFFFFFF + ubfx x16, x17, #0, #32 + ubfx x17, x17, #32, #32 + ubfx x19, x20, #0, #32 + ubfx x20, x20, #32, #32 + sub x16, x16, x27 // remove the bias + sub x17, x17, x27 + sub x19, x19, x27 + sub x20, x20, x27 + + ret + + +.align 4 +__inner_loop_48: +Loop_48: + sbfx x24, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x3, x9 + sub x15, x15, #1 + and x21, x9, x24 + sub x22, x9, x3 // |b_|-|a_| + subs x23, x3, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 + csel x9, x9, x3, hs // |b_| = |a_| + csel x3, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x2, x2, x25, hs + add x23, x9, #2 + lsr x3, x3, #1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + + cbnz x15, Loop_48 + + ret + diff --git a/crypto/blst_src/build/mach-o/ct_is_square_mod_384-x86_64.s b/crypto/blst_src/build/mach-o/ct_is_square_mod_384-x86_64.s new file mode 100644 index 00000000000..f2823941167 --- /dev/null +++ b/crypto/blst_src/build/mach-o/ct_is_square_mod_384-x86_64.s @@ -0,0 +1,471 @@ +.text + +.globl _ct_is_square_mod_384 + +.p2align 5 +_ct_is_square_mod_384: +.cfi_startproc + .byte 
0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $536,%rsp +.cfi_adjust_cfa_offset 536 + + + leaq 24+255(%rsp),%rax + andq $-256,%rax + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbx + movq 24(%rsi),%rcx + movq 32(%rsi),%rdx + movq 40(%rsi),%rdi + movq %rax,%rsi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rcx,72(%rax) + movq %rdx,80(%rax) + movq %rdi,88(%rax) + + xorq %rbp,%rbp + movl $24,%ecx + jmp L$oop_is_square + +.p2align 5 +L$oop_is_square: + movl %ecx,16(%rsp) + + call __ab_approximation_30 + movq %rax,0(%rsp) + movq %rbx,8(%rsp) + + movq $128+48,%rdi + xorq %rsi,%rdi + call __smulq_384_n_shift_by_30 + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq -48(%rdi),%rdi + call __smulq_384_n_shift_by_30 + + movl 16(%rsp),%ecx + xorq $128,%rsi + + andq 48(%rdi),%r14 + shrq $1,%r14 + addq %r14,%rbp + + subl $1,%ecx + jnz L$oop_is_square + + + + + movq 48(%rsi),%r9 + call __inner_loop_48 + + movq $1,%rax + andq %rbp,%rax + xorq $1,%rax + + leaq 536(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -536-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__smulq_384_n_shift_by_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r14 + andq %rbx,%r14 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r14 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r14 + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r15 + andq %rbx,%r15 + mulq %rbx + movq %rax,%r8 + movq 
%r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r15 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r15 + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %r15,%r14 + + shrdq $30,%r9,%r8 + shrdq $30,%r10,%r9 + shrdq $30,%r11,%r10 + shrdq $30,%r12,%r11 + shrdq $30,%r13,%r12 + shrdq $30,%r14,%r13 + + sarq $63,%r14 + xorq %rbx,%rbx + subq %r14,%rbx + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__ab_approximation_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 88(%rsi),%rbx + movq 80(%rsi),%r15 + movq 72(%rsi),%r14 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r11,%r12 + movq 64(%rsi),%r11 + cmovzq %r14,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r10,%r12 + movq 56(%rsi),%r10 + cmovzq %r11,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r9,%r12 + movq 48(%rsi),%r9 + cmovzq %r10,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r8,%r12 + cmovzq %r9,%r15 + + movq %r13,%rax + orq %rbx,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r13 + cmovzq %r9,%rbx + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%r12,%r13 + shldq %cl,%r15,%rbx + + movq $0xFFFFFFFF00000000,%rax + movl %r8d,%r8d + movl %r9d,%r9d + andq %rax,%r13 + andq %rax,%rbx + orq %r13,%r8 + orq %rbx,%r9 + + jmp __inner_loop_30 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__inner_loop_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rbx + movq $0x800000007FFFFFFF,%rcx + leaq -1(%rbx),%r15 + movl $30,%edi + +L$oop_30: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbx,%r12 + movq %rcx,%r13 + movq %rbp,%r14 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rcx,%rbx + cmovbq %r12,%rcx + cmovbq %rax,%rbp + + subq %r9,%r8 + subq %rcx,%rbx + addq %r15,%rbx + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbx + cmovzq %r13,%rcx + cmovzq %r14,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rcx,%rcx + leaq (%rax,%rbp,1),%rbp + subq %r15,%rcx + + subl $1,%edi + jnz L$oop_30 + + shrq $32,%r15 + movl %ebx,%eax + shrq $32,%rbx + movl %ecx,%edx + shrq $32,%rcx + subq %r15,%rax + subq %r15,%rbx + subq %r15,%rdx + subq %r15,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__inner_loop_48: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movl $48,%edi + +L$oop_48: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbp,%r12 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rax,%rbp + + subq %r9,%r8 + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rax,%rbp + + subl $1,%edi + jnz 
L$oop_48 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/ctq_inverse_mod_384-x86_64.s b/crypto/blst_src/build/mach-o/ctq_inverse_mod_384-x86_64.s new file mode 100644 index 00000000000..185a876b87c --- /dev/null +++ b/crypto/blst_src/build/mach-o/ctq_inverse_mod_384-x86_64.s @@ -0,0 +1,1187 @@ +.text + +.globl _ct_inverse_mod_383 + +.p2align 5 +_ct_inverse_mod_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1112,%rsp +.cfi_adjust_cfa_offset 1112 + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 
48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + 
movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + xorq $256+96,%rsi + movl $62,%edi + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 48(%rsi),%r10 + movq 56(%rsi),%r11 + call __inner_loop_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + movq %r8,0(%rdi) + movq %r10,48(%rdi) + + + + leaq 96(%rsi),%rsi + leaq 96(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + + xorq $256+96,%rsi + movl $22,%edi + + movq 0(%rsi),%r8 + xorq %r9,%r9 + movq 48(%rsi),%r10 + xorq %r11,%r11 + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1112-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulq_767x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + movq %r9,8(%rdi) + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + movq %r10,16(%rdi) + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %r11,24(%rdi) + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + movq %r12,32(%rdi) + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + movq %r13,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 
32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + movq %rdx,%rsi + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rsi + addq %rax,%rsi + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rbx + xorq %rdx,%rbp + xorq %rdx,%rcx + xorq %rdx,%rdi + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulq %rsi + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rsi + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rsi + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rsi + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rsi + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rsi + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %rdx,%rbx + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + mulq %rsi + addq %rax,%rbp + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rcx + mulq %rsi + addq %rax,%rcx + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%rdi + movq 8(%rsp),%rdx + imulq %rsi,%rax + movq 16(%rsp),%rsi + addq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulq_383x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + 
adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulq_383_n_shift_by_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq 48(%rsi),%rsi + movq %rdx,%r14 + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $62,%r9,%r8 + shrdq $62,%r10,%r9 + shrdq $62,%r11,%r10 + shrdq $62,%r12,%r11 + shrdq $62,%r13,%r12 + shrdq $62,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 
+__ab_approximation_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 16(%rsi),%r8 + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 8(%rsi),%r8 + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 0(%rsi),%r8 + movq 48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + jmp __inner_loop_62 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 3 +.long 0 +__inner_loop_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + movq %rsi,8(%rsp) + +L$oop_62: + xorq %rax,%rax + xorq %rbx,%rbx + testq $1,%r8 + movq %r10,%rbp + movq %r11,%r14 + cmovnzq %r10,%rax + cmovnzq %r11,%rbx + subq %r8,%rbp + sbbq %r9,%r14 + movq %r8,%r15 + movq %r9,%rsi + subq %rax,%r8 + sbbq %rbx,%r9 + cmovcq %rbp,%r8 + cmovcq %r14,%r9 + cmovcq %r15,%r10 + cmovcq %rsi,%r11 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrdq $1,%r9,%r8 + shrq $1,%r9 + testq $1,%r15 + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz L$oop_62 + + movq 8(%rsp),%rsi + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/ctx_inverse_mod_384-x86_64.s b/crypto/blst_src/build/mach-o/ctx_inverse_mod_384-x86_64.s new file mode 100644 index 00000000000..3e05df3a4b3 --- /dev/null +++ b/crypto/blst_src/build/mach-o/ctx_inverse_mod_384-x86_64.s @@ -0,0 +1,1566 @@ +.text + +.globl _ctx_inverse_mod_383 + +.p2align 5 +_ctx_inverse_mod_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1112,%rsp +.cfi_adjust_cfa_offset 1112 + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $31,%edi + call __ab_approximation_31 + + 
+ movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call 
__smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 
48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call 
__smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + + xorq $256+96,%rsi + movl $53,%edi + + movq 0(%rsi),%r8 + + movq 48(%rsi),%r10 + + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulx_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1112-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulx_767x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq 
%rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + movq %rcx,%rax + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + sarq $63,%rax + xorq %rsi,%rsi + subq %rax,%rsi + + xorq %rax,%rdx + addq %rsi,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %rax,%r13 + xorq %rax,%r14 + xorq %rax,%r15 + xorq %rax,%rbx + xorq %rax,%rbp + xorq %rax,%rcx + xorq %rax,%rdi + addq %rsi,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulxq %r8,%r8,%rax + mulxq %r9,%r9,%rsi + addq %rax,%r9 + mulxq %r10,%r10,%rax + adcq %rsi,%r10 + mulxq %r11,%r11,%rsi + adcq %rax,%r11 + mulxq %r12,%r12,%rax + adcq %rsi,%r12 + mulxq %r13,%r13,%rsi + adcq %rax,%r13 + mulxq %r14,%r14,%rax + adcq %rsi,%r14 + mulxq %r15,%r15,%rsi + adcq %rax,%r15 + mulxq %rbx,%rbx,%rax + adcq %rsi,%rbx + mulxq %rbp,%rbp,%rsi + adcq %rax,%rbp + mulxq %rcx,%rcx,%rax + adcq %rsi,%rcx + mulxq %rdi,%rdi,%rsi + movq 8(%rsp),%rdx + movq 16(%rsp),%rsi + adcq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulx_383x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + movq %rcx,%rdx + adcq %rbp,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + 
xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + adcq %rbp,%r13 + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulx_383_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + xorq %r14,%r14 + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq %rdx,%r14 + + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%rax + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%r12,%r11 + shrdq $31,%rax,%r12 + shrdq $31,%r14,%rax + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulx_191_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %r10,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r10 + addq %rbp,%r9 + adcq $0,%r10 + imulq %rdx + addq %rax,%r10 + adcq $0,%rdx + movq %rdx,%r14 + movq %rcx,%rdx + movq 
48+0(%rsi),%r11 + movq 48+8(%rsi),%r12 + movq 48+16(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r11,%r11,%rbp + mulxq %r12,%r12,%r13 + addq %rbp,%r12 + adcq $0,%r13 + imulq %rdx + addq %rax,%r13 + adcq $0,%rdx + addq %r8,%r11 + adcq %r9,%r12 + adcq %r10,%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r12,%r11 + shrdq $31,%r13,%r12 + shrdq $31,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__ab_approximation_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 16(%rsi),%r8 + cmovzq %r10,%rbp + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 8(%rsi),%r8 + cmovzq %r10,%rbp + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + andnq %r9,%rax,%r9 + andnq %r11,%rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp __inner_loop_31 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__inner_loop_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +L$oop_31: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edi + jnz L$oop_31 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__inner_loop_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + +L$oop_62: + xorq %rax,%rax + testq $1,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq $1,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz L$oop_62 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/div3w-armv8.S b/crypto/blst_src/build/mach-o/div3w-armv8.S new file mode 100644 index 
00000000000..5a5eb3a01d7
--- /dev/null
+++ b/crypto/blst_src/build/mach-o/div3w-armv8.S
@@ -0,0 +1,88 @@
+.text
+
+.globl _div_3_limbs
+
+.align 5
+_div_3_limbs:
+ ldp x4,x5,[x0] // load R
+ eor x0,x0,x0 // Q = 0
+ mov x3,#64 // loop counter
+ nop
+
+Loop:
+ subs x6,x4,x1 // R - D
+ add x0,x0,x0 // Q <<= 1
+ sbcs x7,x5,x2
+ add x0,x0,#1 // Q + speculative bit
+ csel x4,x4,x6,lo // select between R and R - D
+ extr x1,x2,x1,#1 // D >>= 1
+ csel x5,x5,x7,lo
+ lsr x2,x2,#1
+ sbc x0,x0,xzr // subtract speculative bit
+ sub x3,x3,#1
+ cbnz x3,Loop
+
+ asr x3,x0,#63 // top bit -> mask
+ add x0,x0,x0 // Q <<= 1
+ subs x6,x4,x1 // R - D
+ add x0,x0,#1 // Q + specilative bit
+ sbcs x7,x5,x2
+ sbc x0,x0,xzr // subtract speculative bit
+
+ orr x0,x0,x3 // all ones if overflow
+
+ ret
+
+.globl _quot_rem_128
+
+.align 5
+_quot_rem_128:
+ ldp x3,x4,[x1]
+
+ mul x5,x3,x2 // divisor[0:1} * quotient
+ umulh x6,x3,x2
+ mul x11, x4,x2
+ umulh x7,x4,x2
+
+ ldp x8,x9,[x0] // load 3 limbs of the dividend
+ ldr x10,[x0,#16]
+
+ adds x6,x6,x11
+ adc x7,x7,xzr
+
+ subs x8,x8,x5 // dividend - divisor * quotient
+ sbcs x9,x9,x6
+ sbcs x10,x10,x7
+ sbc x5,xzr,xzr // borrow -> mask
+
+ add x2,x2,x5 // if borrowed, adjust the quotient ...
+ and x3,x3,x5
+ and x4,x4,x5
+ adds x8,x8,x3 // ... and add divisor
+ adc x9,x9,x4
+
+ stp x8,x9,[x0] // save 2 limbs of the remainder
+ str x2,[x0,#16] // and one limb of the quotient
+
+ mov x0,x2 // return adjusted quotient
+
+ ret
+
+
+.globl _quot_rem_64
+
+.align 5
+_quot_rem_64:
+ ldr x3,[x1]
+ ldr x8,[x0] // load 1 limb of the dividend
+
+ mul x5,x3,x2 // divisor * quotient
+
+ sub x8,x8,x5 // dividend - divisor * quotient
+
+ stp x8,x2,[x0] // save remainder and quotient
+
+ mov x0,x2 // return quotient
+
+ ret
+
diff --git a/crypto/blst_src/build/mach-o/div3w-x86_64.s b/crypto/blst_src/build/mach-o/div3w-x86_64.s
new file mode 100644
index 00000000000..8075571c87d
--- /dev/null
+++ b/crypto/blst_src/build/mach-o/div3w-x86_64.s
@@ -0,0 +1,115 @@
+.text
+
+.globl _div_3_limbs
+.private_extern _div_3_limbs
+
+.p2align 5
+_div_3_limbs:
+.cfi_startproc
+ .byte 0xf3,0x0f,0x1e,0xfa
+
+ movq (%rdi),%r8
+ movq 8(%rdi),%r9
+ xorq %rax,%rax
+ movl $64,%ecx
+
+L$oop:
+ movq %r8,%r10
+ subq %rsi,%r8
+ movq %r9,%r11
+ sbbq %rdx,%r9
+ leaq 1(%rax,%rax,1),%rax
+ movq %rdx,%rdi
+ cmovcq %r10,%r8
+ cmovcq %r11,%r9
+ sbbq $0,%rax
+ shlq $63,%rdi
+ shrq $1,%rsi
+ shrq $1,%rdx
+ orq %rdi,%rsi
+ subl $1,%ecx
+ jnz L$oop
+
+ leaq 1(%rax,%rax,1),%rcx
+ sarq $63,%rax
+
+ subq %rsi,%r8
+ sbbq %rdx,%r9
+ sbbq $0,%rcx
+
+ orq %rcx,%rax
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+
+.globl _quot_rem_128
+.private_extern _quot_rem_128
+
+.p2align 5
+_quot_rem_128:
+.cfi_startproc
+ .byte 0xf3,0x0f,0x1e,0xfa
+
+ movq %rdx,%rax
+ movq %rdx,%rcx
+
+ mulq 0(%rsi)
+ movq %rax,%r8
+ movq %rcx,%rax
+ movq %rdx,%r9
+
+ mulq 8(%rsi)
+ addq %rax,%r9
+ adcq $0,%rdx
+
+ movq 0(%rdi),%r10
+ movq 8(%rdi),%r11
+ movq 16(%rdi),%rax
+
+ subq %r8,%r10
+ sbbq %r9,%r11
+ sbbq %rdx,%rax
+ sbbq %r8,%r8
+
+ addq %r8,%rcx
+ movq %r8,%r9
+ andq 0(%rsi),%r8
+ andq 8(%rsi),%r9
+ addq %r8,%r10
+ adcq %r9,%r11
+
+ movq %r10,0(%rdi)
+ movq %r11,8(%rdi)
+ movq %rcx,16(%rdi)
+
+ movq %rcx,%rax
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+
+
+
+
+
+
+.globl _quot_rem_64
+.private_extern _quot_rem_64
+
+.p2align 5
+_quot_rem_64:
+.cfi_startproc
+ .byte 0xf3,0x0f,0x1e,0xfa
+
+ movq %rdx,%rax
+ imulq 0(%rsi),%rdx
+
+ movq 0(%rdi),%r10
+
+ subq %rdx,%r10
+
+ movq %r10,0(%rdi)
+ movq %rax,8(%rdi)
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+
diff --git
a/crypto/blst_src/build/mach-o/mul_mont_256-armv8.S b/crypto/blst_src/build/mach-o/mul_mont_256-armv8.S new file mode 100644 index 00000000000..4f506b58b0f --- /dev/null +++ b/crypto/blst_src/build/mach-o/mul_mont_256-armv8.S @@ -0,0 +1,464 @@ +.text + +.globl _mul_mont_sparse_256 +.private_extern _mul_mont_sparse_256 + +.align 5 +_mul_mont_sparse_256: + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x10,x11,[x1] + ldr x9, [x2] + ldp x12,x13,[x1,#16] + + mul x19,x10,x9 + ldp x5,x6,[x3] + mul x20,x11,x9 + ldp x7,x8,[x3,#16] + mul x21,x12,x9 + mul x22,x13,x9 + + umulh x14,x10,x9 + umulh x15,x11,x9 + mul x3,x4,x19 + umulh x16,x12,x9 + umulh x17,x13,x9 + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,xzr, x17 + mul x17,x8,x3 + ldr x9,[x2,8*1] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*2] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*3] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + adcs x20,x21,x15 + adcs x21,x22,x16 + adcs x22,x23,x17 + adc x23,xzr,xzr + + subs x14,x19,x5 + sbcs x15,x20,x6 + sbcs x16,x21,x7 + sbcs x17,x22,x8 + sbcs xzr, x23,xzr + + csel x19,x19,x14,lo + csel x20,x20,x15,lo + csel x21,x21,x16,lo + csel x22,x22,x17,lo + + stp x19,x20,[x0] + stp x21,x22,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr 
x29,[sp],#64 + ret + +.globl _sqr_mont_sparse_256 +.private_extern _sqr_mont_sparse_256 + +.align 5 +_sqr_mont_sparse_256: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + mov x4,x3 + + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x11,x6,x5 // a[1]*a[0] + umulh x15,x6,x5 + mul x12,x7,x5 // a[2]*a[0] + umulh x16,x7,x5 + mul x13,x8,x5 // a[3]*a[0] + umulh x19,x8,x5 + + adds x12,x12,x15 // accumulate high parts of multiplication + mul x14,x7,x6 // a[2]*a[1] + umulh x15,x7,x6 + adcs x13,x13,x16 + mul x16,x8,x6 // a[3]*a[1] + umulh x17,x8,x6 + adc x19,x19,xzr // can't overflow + + mul x20,x8,x7 // a[3]*a[2] + umulh x21,x8,x7 + + adds x15,x15,x16 // accumulate high parts of multiplication + mul x10,x5,x5 // a[0]*a[0] + adc x16,x17,xzr // can't overflow + + adds x13,x13,x14 // accumulate low parts of multiplication + umulh x5,x5,x5 + adcs x19,x19,x15 + mul x15,x6,x6 // a[1]*a[1] + adcs x20,x20,x16 + umulh x6,x6,x6 + adc x21,x21,xzr // can't overflow + + adds x11,x11,x11 // acc[1-6]*=2 + mul x16,x7,x7 // a[2]*a[2] + adcs x12,x12,x12 + umulh x7,x7,x7 + adcs x13,x13,x13 + mul x17,x8,x8 // a[3]*a[3] + adcs x19,x19,x19 + umulh x8,x8,x8 + adcs x20,x20,x20 + adcs x21,x21,x21 + adc x22,xzr,xzr + + adds x11,x11,x5 // +a[i]*a[i] + adcs x12,x12,x15 + adcs x13,x13,x6 + adcs x19,x19,x16 + adcs x20,x20,x7 + adcs x21,x21,x17 + adc x22,x22,x8 + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + adds x10,x10,x19 // accumulate upper half + adcs x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adc x19,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x19,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + +.globl _from_mont_256 +.private_extern _from_mont_256 + +.align 5 +_from_mont_256: +.long 3573752639 + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 +.long 3573752767 + ret + + +.globl _redc_mont_256 +.private_extern _redc_mont_256 + +.align 5 +_redc_mont_256: +.long 3573752639 + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + ldp x14,x15,[x1,#32] + ldp x16,x17,[x1,#48] + + adds x10,x10,x14 + adcs x11,x11,x15 + adcs x12,x12,x16 + adcs x13,x13,x17 + adc x9,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x9,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 +.long 3573752767 + ret + + + +.align 5 +__mul_by_1_mont_256: + mul x3,x4,x10 + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + adc x13,x9,x17 + + ret + diff --git a/crypto/blst_src/build/mach-o/mul_mont_384-armv8.S b/crypto/blst_src/build/mach-o/mul_mont_384-armv8.S new file mode 100644 index 00000000000..5aa2e9f3ae7 --- /dev/null +++ b/crypto/blst_src/build/mach-o/mul_mont_384-armv8.S @@ -0,0 +1,2372 @@ +.text + +.globl _add_mod_384x384 + +.align 5 +_add_mod_384x384: +.long 3573752639 + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __add_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 +.long 3573752767 + ret + + + +.align 5 +__add_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + stp x11, x12, [x0] + adcs x15,x15,x23 + ldp x11, x12, [x1,#48] + adcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + adcs x11,x11,x19 + stp x15, x16, [x0,#32] + adcs x12,x12,x20 + ldp x15, x16, [x1,#80] + adcs x13,x13,x21 + ldp x23,x24,[x2,#80] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + stp x11,x12,[x0,#48] + csel x15,x15,x23,lo + stp x13,x14,[x0,#64] + csel x16,x16,x24,lo + stp x15,x16,[x0,#80] + + ret + + +.globl _sub_mod_384x384 + +.align 5 +_sub_mod_384x384: +.long 3573752639 + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 +.long 3573752767 + ret + + + +.align 5 +__sub_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + stp x11, x12, [x0] + sbcs x15,x15,x23 + ldp x11, x12, [x1,#48] + sbcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + sbcs x11,x11,x19 + stp x15, x16, [x0,#32] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#80] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#80] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + ret + + + +.align 5 +__add_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + stp x11,x12,[x0] + csel x16,x16,x24,lo + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + + + +.align 5 +__sub_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc 
x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0] + adc x16,x16,x24 + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + + +.globl _mul_mont_384x +.private_extern _mul_mont_384x + +.align 5 +_mul_mont_384x: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#288 // space for 3 768-bit vectors + + mov x26,x0 // save r_ptr + mov x27,x1 // save b_ptr + mov x28,x2 // save b_ptr + + sub x0,sp,#0 // mul_384(t0, a->re, b->re) + bl __mul_384 + + add x1,x1,#48 // mul_384(t1, a->im, b->im) + add x2,x2,#48 + add x0,sp,#96 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + sub x2,x1,#48 + add x0,sp,#240 + bl __add_mod_384 + + add x1,x28,#0 + add x2,x28,#48 + add x0,sp,#192 // t2 + bl __add_mod_384 + + add x1,x0,#0 + add x2,x0,#48 + bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,x0 + add x2,sp,#0 + bl __sub_mod_384x384 + + add x2,sp,#96 + bl __sub_mod_384x384 // t2 = t2-t0-t1 + + add x1,sp,#0 + add x2,sp,#96 + add x0,sp,#0 + bl __sub_mod_384x384 // t0 = t0-t1 + + add x1,sp,#0 // ret->re = redc(t0) + add x0,x26,#0 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + + add x1,sp,#192 // ret->im = redc(t2) + add x0,x0,#48 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#288 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _sqr_mont_384x +.private_extern _sqr_mont_384x + +.align 5 +_sqr_mont_384x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 2 384-bit vectors + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + add x0,sp,#0 + bl __add_mod_384 // t0 = a->re + a->im + + add x0,sp,#48 + bl __sub_mod_384 // t1 = a->re - a->im + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) + + adds x11,x11,x11 // add with itself + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x19,x11,x19,lo + csel x20,x12,x20,lo + csel x21,x13,x21,lo + ldp x11,x12,[sp] + csel x22,x14,x22,lo + ldr x17, [sp,#48] + csel x23,x15,x23,lo + ldp x13,x14,[sp,#16] + csel x24,x16,x24,lo + ldp x15,x16,[sp,#32] + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + add x2,sp,#48 + bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _mul_mont_384 +.private_extern _mul_mont_384 + +.align 5 +_mul_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + + +.align 5 +__mul_mont_384: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + mov x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*1] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + 
adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*2] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*3] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*4] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh 
x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*5] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + adc x17,x17,xzr + + adds x19,x20,x26 + adcs x20,x21,x27 + adcs x21,x22,x28 + adcs x22,x23,x0 + adcs x23,x24,x1 + adcs x24,x25,x3 + adc x25,x17,xzr + + subs x26,x19,x5 + sbcs x27,x20,x6 + sbcs x28,x21,x7 + sbcs x0,x22,x8 + sbcs x1,x23,x9 + sbcs x3,x24,x10 + sbcs xzr, x25,xzr + + csel x11,x19,x26,lo + csel x12,x20,x27,lo + csel x13,x21,x28,lo + csel x14,x22,x0,lo + csel x15,x23,x1,lo + csel x16,x24,x3,lo + ret + + +.globl _sqr_mont_384 +.private_extern _sqr_mont_384 + +.align 5 +_sqr_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for 768-bit vector + mov x4,x3 // adjust for missing b_ptr + + mov x3,x0 // save r_ptr + mov x0,sp + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + mov x1,sp + mov x0,x3 // restore r_ptr + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _sqr_n_mul_mont_383 +.private_extern _sqr_n_mul_mont_383 + +.align 5 +_sqr_n_mul_mont_383: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 768-bit vector + mov x17,x5 // save b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + mov x0,sp +Loop_sqr_383: + bl __sqr_384 + sub x2,x2,#1 // counter + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,sp + bl __mul_by_1_mont_384 + + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // just accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + cbnz x2,Loop_sqr_383 + + mov x2,x17 + ldr x17,[x17] + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.align 5 +__sqr_384: + mul x19,x12,x11 + mul x20,x13,x11 + mul x21,x14,x11 + mul x22,x15,x11 + mul x23,x16,x11 + + umulh x6,x12,x11 + umulh x7,x13,x11 + umulh x8,x14,x11 + umulh x9,x15,x11 + adds x20,x20,x6 + umulh x10,x16,x11 + adcs x21,x21,x7 + mul x7,x13,x12 + adcs x22,x22,x8 + mul x8,x14,x12 + adcs x23,x23,x9 + mul x9,x15,x12 + adc x24,xzr, x10 + mul x10,x16,x12 + + adds x21,x21,x7 + umulh x7,x13,x12 + adcs x22,x22,x8 + umulh x8,x14,x12 + adcs x23,x23,x9 + umulh x9,x15,x12 + adcs x24,x24,x10 + umulh x10,x16,x12 + adc x25,xzr,xzr + + mul x5,x11,x11 + adds x22,x22,x7 + umulh x11, x11,x11 + adcs x23,x23,x8 + mul x8,x14,x13 + adcs x24,x24,x9 + mul x9,x15,x13 + adc x25,x25,x10 + mul x10,x16,x13 + + adds x23,x23,x8 + umulh x8,x14,x13 + adcs x24,x24,x9 + umulh x9,x15,x13 + adcs x25,x25,x10 + umulh x10,x16,x13 + adc x26,xzr,xzr + + mul x6,x12,x12 + adds x24,x24,x8 + umulh x12, x12,x12 + adcs x25,x25,x9 + mul x9,x15,x14 + adc x26,x26,x10 + mul x10,x16,x14 + + adds x25,x25,x9 + umulh x9,x15,x14 + adcs x26,x26,x10 + umulh x10,x16,x14 + adc x27,xzr,xzr + mul x7,x13,x13 + adds x26,x26,x9 + umulh x13, x13,x13 + adc x27,x27,x10 + mul x8,x14,x14 + + mul x10,x16,x15 + umulh x14, x14,x14 + adds x27,x27,x10 + umulh x10,x16,x15 + mul x9,x15,x15 + adc x28,x10,xzr + + adds x19,x19,x19 + adcs x20,x20,x20 + adcs x21,x21,x21 + adcs x22,x22,x22 + adcs x23,x23,x23 + adcs x24,x24,x24 + adcs x25,x25,x25 + adcs x26,x26,x26 + umulh x15, x15,x15 + adcs x27,x27,x27 + mul x10,x16,x16 + adcs x28,x28,x28 + umulh x16, x16,x16 + adc x1,xzr,xzr + + adds x19,x19,x11 + adcs x20,x20,x6 + adcs x21,x21,x12 + adcs x22,x22,x7 + adcs x23,x23,x13 + adcs x24,x24,x8 + adcs x25,x25,x14 + stp x5,x19,[x0] + adcs x26,x26,x9 + stp x20,x21,[x0,#16] + adcs x27,x27,x15 + stp x22,x23,[x0,#32] + adcs x28,x28,x10 + stp x24,x25,[x0,#48] + adc x16,x16,x1 + stp x26,x27,[x0,#64] + stp x28,x16,[x0,#80] + + ret + +.globl _sqr_384 +.private_extern _sqr_384 + +.align 5 +_sqr_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _redc_mont_384 +.private_extern _redc_mont_384 + +.align 5 +_redc_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _from_mont_384 +.private_extern _from_mont_384 + +.align 5 +_from_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + + +.align 5 +__mul_by_1_mont_384: + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + mul x26,x4,x11 + ldp x15,x16,[x1,#32] + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul 
x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + ret + + + +.align 5 +__redc_tail_mont_384: + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + + +.globl _mul_384 +.private_extern _mul_384 + +.align 5 +_mul_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + bl __mul_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + + +.align 5 +__mul_384: + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + + umulh x5,x11,x17 + umulh x6,x12,x17 + umulh x7,x13,x17 + umulh x8,x14,x17 + umulh x9,x15,x17 + umulh x10,x16,x17 + ldr x17,[x2,8*1] + + str x19,[x0] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,xzr, x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(1+1)] + adc x25,xzr,xzr + + str x19,[x0,8*1] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(2+1)] + adc x25,xzr,xzr + + str x19,[x0,8*2] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(3+1)] + adc x25,xzr,xzr + + str x19,[x0,8*3] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(4+1)] + adc x25,xzr,xzr + + str x19,[x0,8*4] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + adc x25,xzr,xzr + + str x19,[x0,8*5] + adds x19,x20,x5 + adcs x20,x21,x6 + adcs x21,x22,x7 + adcs x22,x23,x8 + adcs x23,x24,x9 + adc x24,x25,x10 + + stp x19,x20,[x0,#48] + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ret + + +.globl _mul_382x +.private_extern _mul_382x + +.align 5 +_mul_382x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for two 384-bit vectors + + ldp x11,x12,[x1] + mov x26,x0 // save r_ptr + ldp x19,x20,[x1,#48] + mov x27,x1 // save a_ptr + ldp x13,x14,[x1,#16] + mov x28,x2 // save b_ptr + ldp x21,x22,[x1,#64] + ldp x15,x16,[x1,#32] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x23,x24,[x1,#80] + adcs x6,x12,x20 + ldp x11,x12,[x2] + adcs x7,x13,x21 + ldp x19,x20,[x2,#48] + adcs x8,x14,x22 + ldp x13,x14,[x2,#16] + adcs x9,x15,x23 + ldp x21,x22,[x2,#64] + adc x10,x16,x24 + ldp x15,x16,[x2,#32] + + stp x5,x6,[sp] + adds x5,x11,x19 // t1 = b->re + b->im + ldp x23,x24,[x2,#80] + adcs x6,x12,x20 + stp x7,x8,[sp,#16] + adcs x7,x13,x21 + adcs x8,x14,x22 + stp x9,x10,[sp,#32] + adcs x9,x15,x23 + stp x5,x6,[sp,#48] + adc x10,x16,x24 + stp x7,x8,[sp,#64] + stp x9,x10,[sp,#80] + + bl __mul_384 // _mul_384(ret->re, a->re, b->re) + + add x1,sp,#0 // _mul_384(ret->im, t0, t1) + add x2,sp,#48 + add x0,x26,#96 + bl __mul_384 + + add x1,x27,#48 // _mul_384(tx, a->im, b->im) + add x2,x28,#48 + add x0,sp,#0 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + add x1,x26,#96 // ret->im -= tx + add x2,sp,#0 + add x0,x26,#96 + bl __sub_mod_384x384 + + add x2,x26,#0 // ret->im -= ret->re + bl __sub_mod_384x384 + + add x1,x26,#0 // ret->re -= tx + add x2,sp,#0 + add x0,x26,#0 + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _sqr_382x +.private_extern _sqr_382x + +.align 5 +_sqr_382x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x19,x20,[x1,#48] + ldp x13,x14,[x1,#16] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x21,x22,[x1,#64] + adcs x6,x12,x20 + ldp x15,x16,[x1,#32] + adcs x7,x13,x21 + ldp x23,x24,[x1,#80] + adcs x8,x14,x22 + stp x5,x6,[x0] + adcs x9,x15,x23 + ldp x5,x6,[x2] + adc x10,x16,x24 + stp x7,x8,[x0,#16] + + subs x11,x11,x19 // t1 = a->re - a->im + ldp x7,x8,[x2,#16] + sbcs x12,x12,x20 + stp x9,x10,[x0,#32] + sbcs x13,x13,x21 + ldp x9,x10,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + adds x11,x11,x19 + and x21,x7,x25 + adcs x12,x12,x20 + and x22,x8,x25 + adcs x13,x13,x21 + and x23,x9,x25 + adcs x14,x14,x22 + and x24,x10,x25 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + mov x4,x1 // save a_ptr + add x1,x0,#0 // _mul_384(ret->re, t0, t1) + add x2,x0,#48 + bl __mul_384 + + add x1,x4,#0 // _mul_384(ret->im, a->re, a->im) + add x2,x4,#48 + add x0,x0,#96 + bl __mul_384 + ldr x30,[x29,#8] + + ldp x11,x12,[x0] + ldp x13,x14,[x0,#16] + adds x11,x11,x11 // add with itself + ldp x15,x16,[x0,#32] + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adcs x19,x19,x19 + adcs x20,x20,x20 + stp x11,x12,[x0] + adcs x21,x21,x21 + stp x13,x14,[x0,#16] + adcs x22,x22,x22 + stp x15,x16,[x0,#32] + adcs x23,x23,x23 + stp x19,x20,[x0,#48] + adc x24,x24,x24 + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _sqr_mont_382x +.private_extern _sqr_mont_382x + +.align 5 +_sqr_mont_382x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#112 // space for two 384-bit vectors + word + mov x4,x3 // adjust for missing b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x17,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x5,x11,x17 // t0 = a->re + a->im + adcs x6,x12,x20 + adcs x7,x13,x21 + adcs x8,x14,x22 + adcs x9,x15,x23 + adc x10,x16,x24 + + subs x19,x11,x17 // t1 = a->re - a->im + sbcs x20,x12,x20 + sbcs x21,x13,x21 + sbcs x22,x14,x22 + sbcs x23,x15,x23 + sbcs x24,x16,x24 + sbc x25,xzr,xzr // borrow flag as mask + + stp x5,x6,[sp] + stp x7,x8,[sp,#16] + stp x9,x10,[sp,#32] + stp x19,x20,[sp,#48] + stp x21,x22,[sp,#64] + stp x23,x24,[sp,#80] + str x25,[sp,#96] + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + bl __mul_mont_383_nonred // _mul_mont_384(ret->im, a->re, a->im) + + adds x19,x11,x11 // add with itself + adcs x20,x12,x12 + adcs x21,x13,x13 + adcs x22,x14,x14 + adcs x23,x15,x15 + adc x24,x16,x16 + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + ldp x11,x12,[sp] + ldr x17,[sp,#48] + ldp x13,x14,[sp,#16] + ldp x15,x16,[sp,#32] + + add x2,sp,#48 + bl __mul_mont_383_nonred // _mul_mont_384(ret->im, t0, t1) + ldr x30,[x29,#8] + + ldr x25,[sp,#96] // account for sign from a->re - a->im + ldp x19,x20,[sp] + ldp x21,x22,[sp,#16] + ldp x23,x24,[sp,#32] + + and x19,x19,x25 + and x20,x20,x25 + and x21,x21,x25 + and x22,x22,x25 + and x23,x23,x25 + and x24,x24,x25 + + subs x11,x11,x19 + sbcs x12,x12,x20 + sbcs x13,x13,x21 + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + and x21,x7,x25 + and x22,x8,x25 + and x23,x9,x25 + and x24,x10,x25 + + adds x11,x11,x19 + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#112 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + + +.align 5 +__mul_mont_383_nonred: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + ldr x17,[x2,8*1] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul 
x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*2] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*3] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*4] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*5] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 
+ mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + + adds x11,x20,x26 + adcs x12,x21,x27 + adcs x13,x22,x28 + adcs x14,x23,x0 + adcs x15,x24,x1 + adcs x16,x25,x3 + + ret + + +.globl _sgn0_pty_mont_384 +.private_extern _sgn0_pty_mont_384 + +.align 5 +_sgn0_pty_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + adds x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _sgn0_pty_mont_384x +.private_extern _sgn0_pty_mont_384x + +.align 5 +_sgn0_pty_mont_384x: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + add x1,x1,#48 + + and x2,x11,#1 + orr x3,x11,x12 + adds x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + orr x3,x3,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x2,x2,x17 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + orr x1,x11,x12 + adds x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + orr x1,x1,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + diff --git a/crypto/blst_src/build/mach-o/mulq_mont_256-x86_64.s b/crypto/blst_src/build/mach-o/mulq_mont_256-x86_64.s new file mode 100644 index 00000000000..d83f5440342 --- /dev/null +++ b/crypto/blst_src/build/mach-o/mulq_mont_256-x86_64.s @@ -0,0 +1,706 @@ +.text + +.globl _mul_mont_sparse_256 +.private_extern _mul_mont_sparse_256 + +.p2align 5 +_mul_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rdx),%rax + movq 0(%rsi),%r13 + movq 8(%rsi),%r14 + movq 16(%rsi),%r12 + movq 24(%rsi),%rbp + movq %rdx,%rbx + + movq %rax,%r15 + mulq %r13 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqr_mont_sparse_256 +.private_extern _sqr_mont_sparse_256 + +.p2align 5 +_sqr_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rax + movq %rcx,%r8 + movq 8(%rsi),%r14 + movq %rdx,%rcx + movq 16(%rsi),%r12 + leaq (%rsi),%rbx + movq 24(%rsi),%rbp + + movq %rax,%r15 + mulq %rax + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + mulq %r14 + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq 8(%rbx),%rax + adcq $0,%rdx + xorq %r14,%r14 + movq %rdx,%r13 + + movq %r9,%rdi + imulq %r8,%r9 + + + movq %rax,%r15 + mulq 0(%rsi) + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq 
$0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + xorq %r15,%r15 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r9,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rdi,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + addq %rdx,%r13 + adcq $0,%r14 + adcq $0,%r15 + movq %r10,%rdi + imulq %r8,%r10 + + + movq %rax,%r9 + mulq 0(%rsi) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + xorq %r9,%r9 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r10,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rdi,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r13 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + addq %rdx,%r14 + adcq $0,%r15 + adcq $0,%r9 + movq %r11,%rdi + imulq %r8,%r11 + + + movq %rax,%r10 + mulq 0(%rsi) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r9 + xorq %r10,%r10 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r11,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rdi,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + addq %rdx,%r15 + adcq $0,%r9 + adcq $0,%r10 + imulq %r8,%rax + movq 8(%rsp),%rsi + + + movq %rax,%r11 + mulq 0(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r12,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + movq %r14,%rbx + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rdx,%r9 + adcq $0,%r10 + + + + + movq %r15,%r12 + subq 0(%rcx),%r13 + sbbq 8(%rcx),%r14 + sbbq 16(%rcx),%r15 + movq %r9,%rbp + sbbq 24(%rcx),%r9 + sbbq $0,%r10 + + cmovcq %rax,%r13 + cmovcq %rbx,%r14 + cmovcq %r12,%r15 + movq %r13,0(%rsi) + cmovcq %rbp,%r9 + movq %r14,8(%rsi) + movq %r15,16(%rsi) + movq %r9,24(%rsi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _from_mont_256 +.private_extern _from_mont_256 + +.p2align 5 +_from_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 
+.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + + + + + movq %r14,%r10 + movq %r15,%r11 + movq %r9,%r12 + + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + sbbq 24(%rbx),%r9 + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _redc_mont_256 +.private_extern _redc_mont_256 + +.p2align 5 +_redc_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + addq 32(%rsi),%r13 + adcq 40(%rsi),%r14 + movq %r13,%rax + adcq 48(%rsi),%r15 + movq %r14,%r10 + adcq 56(%rsi),%r9 + sbbq %rsi,%rsi + + + + + movq %r15,%r11 + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + movq %r9,%r12 + sbbq 24(%rbx),%r9 + sbbq $0,%rsi + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_by_1_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + movq %rax,%r13 + imulq %rcx,%rax + movq %rax,%r9 + + mulq 0(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq %rdx,%r13 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r10 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 16(%rbx) + movq %r10,%r14 + imulq %rcx,%r10 + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r11 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r13,%r12 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r9 + 
imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/mulq_mont_384-x86_64.s b/crypto/blst_src/build/mach-o/mulq_mont_384-x86_64.s new file mode 100644 index 00000000000..0d8ac89cfc2 --- /dev/null +++ b/crypto/blst_src/build/mach-o/mulq_mont_384-x86_64.s @@ -0,0 +1,3612 @@ +.text + + + + + + + + +.p2align 5 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq 
%rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mul_mont_384x +.private_extern _mul_mont_384x + +.p2align 5 +_mul_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $328,%rsp +.cfi_adjust_cfa_offset 328 + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulq_384 + + + leaq 48(%rbx),%rbx + leaq 48(%rsi),%rsi + leaq 40+96(%rsp),%rdi + call __mulq_384 + + + movq 8(%rsp),%rcx + leaq -48(%rsi),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulq_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + movq %rcx,%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -328-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_mont_384x +.private_extern _sqr_mont_384x + +.p2align 5 +_sqr_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + call __mulq_mont_384 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + movq %r14,%r12 + adcq %r9,%r9 + movq %r15,%r13 + adcq %r10,%r10 + movq %r8,%rax + adcq %r11,%r11 + movq %r9,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r10,%rbp + sbbq 16(%rcx),%r8 + sbbq 24(%rcx),%r9 + sbbq 32(%rcx),%r10 + movq %r11,%rsi + sbbq 40(%rcx),%r11 + sbbq $0,%rdx + + cmovcq 
%r12,%r14 + cmovcq %r13,%r15 + cmovcq %rax,%r8 + movq %r14,48(%rdi) + cmovcq %rbx,%r9 + movq %r15,56(%rdi) + cmovcq %rbp,%r10 + movq %r8,64(%rdi) + cmovcq %rsi,%r11 + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _mul_382x +.private_extern _mul_382x + +.p2align 5 +_mul_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulq_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulq_384 + + + leaq 48(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulq_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_382x +.private_extern _sqr_382x + +.p2align 5 +_sqr_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi 
+.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulq_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulq_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mul_384 +.private_extern _mul_384 + +.p2align 5 +_mul_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq %rdx,%rbx + call __mulq_384 + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__mulq_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rax + + movq %rax,%rbp + mulq 0(%rsi) + movq %rax,0(%rdi) + movq %rbp,%rax + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r11 + movq 8(%rbx),%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,8(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + 
movq 16(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,16(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,24(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,32(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,40(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq %rax,%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rcx,48(%rdi) + movq %r8,56(%rdi) + movq %r9,64(%rdi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_384 +.private_extern _sqr_384 + +.p2align 5 +_sqr_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call 
__sqrq_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__sqrq_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r15 + movq 16(%rsi),%rcx + movq 24(%rsi),%rbx + + + movq %rax,%r14 + mulq %r15 + movq %rax,%r9 + movq %r14,%rax + movq 32(%rsi),%rbp + movq %rdx,%r10 + + mulq %rcx + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + movq 40(%rsi),%rsi + movq %rdx,%r11 + + mulq %rbx + addq %rax,%r11 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq %rax + xorq %r8,%r8 + movq %rax,0(%rdi) + movq %r15,%rax + addq %r9,%r9 + adcq $0,%r8 + addq %rdx,%r9 + adcq $0,%r8 + movq %r9,8(%rdi) + + mulq %rcx + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbx + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbp + addq %rax,%r13 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rax + xorq %r9,%r9 + addq %rax,%r8 + movq %rcx,%rax + addq %r10,%r10 + adcq %r11,%r11 + adcq $0,%r9 + addq %r8,%r10 + adcq %rdx,%r11 + adcq $0,%r9 + movq %r10,16(%rdi) + + mulq %rbx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + movq %r11,24(%rdi) + movq %rdx,%r8 + + mulq %rbp + addq %rax,%r14 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq %rsi + addq %rax,%r15 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + xorq %r11,%r11 + addq %rax,%r9 + movq %rbx,%rax + addq %r12,%r12 + adcq %r13,%r13 + adcq $0,%r11 + addq %r9,%r12 + adcq %rdx,%r13 + adcq $0,%r11 + movq %r12,32(%rdi) + + + mulq %rbp + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %r13,40(%rdi) + movq %rdx,%r8 + + mulq %rsi + addq %rax,%rcx + movq %rbx,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%rbx + + mulq %rax + xorq %r12,%r12 + addq %rax,%r11 + movq %rbp,%rax + addq %r14,%r14 + adcq %r15,%r15 + adcq $0,%r12 + addq %r11,%r14 + adcq %rdx,%r15 + movq %r14,48(%rdi) + adcq $0,%r12 + movq %r15,56(%rdi) + + + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + xorq %r13,%r13 + addq %rax,%r12 + movq %rsi,%rax + addq %rcx,%rcx + adcq %rbx,%rbx + adcq $0,%r13 + addq %r12,%rcx + adcq %rdx,%rbx + movq %rcx,64(%rdi) + adcq $0,%r13 + movq %rbx,72(%rdi) + + + mulq %rax + addq %r13,%rax + addq %rbp,%rbp + adcq $0,%rdx + addq %rbp,%rax + adcq $0,%rdx + movq %rax,80(%rdi) + movq %rdx,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqr_mont_384 +.private_extern _sqr_mont_384 + +.p2align 5 +_sqr_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + 
subq $120,%rsp +.cfi_adjust_cfa_offset 8*15 + + + movq %rcx,96(%rsp) + movq %rdx,104(%rsp) + movq %rdi,112(%rsp) + + movq %rsp,%rdi + call __sqrq_384 + + leaq 0(%rsp),%rsi + movq 96(%rsp),%rcx + movq 104(%rsp),%rbx + movq 112(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 120(%rsp),%r8 + movq 120(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*21 + + .byte 0xf3,0xc3 +.cfi_endproc + + + + +.globl _redc_mont_384 +.private_extern _redc_mont_384 + +.p2align 5 +_redc_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + + + +.globl _from_mont_384 +.private_extern _from_mont_384 + +.p2align 5 +_from_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + + + + + + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_by_1_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r8 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r9 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r10 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r11 + 
movq %r8,%rax + adcq $0,%rdx + movq %r9,%r15 + imulq %rcx,%r9 + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 32(%rbx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 40(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r9,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %r10,%r8 + imulq %rcx,%r10 + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r8 + movq %r10,%rax + adcq %rdx,%r8 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %r11,%r9 + imulq %rcx,%r11 + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r11,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %r12,%r10 + imulq %rcx,%r12 + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %r13,%r11 + imulq %rcx,%r13 + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq 
%rdx,%r11 + + mulq 40(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__redc_tail_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0_pty_mont_384 +.private_extern _sgn0_pty_mont_384 + +.p2align 5 +_sgn0_pty_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0_pty_mont_384x +.private_extern _sgn0_pty_mont_384x + +.p2align 5 +_sgn0_pty_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq 
%r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mul_mont_384 +.private_extern _mul_mont_384 + +.p2align 5 +_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq 0(%rdx),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + movq %rdx,%rbx + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + + call __mulq_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -72 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rdi + mulq %r14 + movq %rax,%r8 + movq %rdi,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%rbp + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + xorq %r15,%r15 + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r8,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + adcq $0,%r15 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r9,%rbp + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq 
$0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r14 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r14 + movq %r9,%rax + adcq %rdx,%r15 + adcq $0,%r8 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r9,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + adcq $0,%r8 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r10,%rbp + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r15 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r15 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r10,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r8 + adcq $0,%r9 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r11,%rbp + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r11,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r8 + 
adcq %rdx,%r9 + adcq $0,%r10 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r12,%rbp + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r9 + adcq $0,%rdx + xorq %r11,%r11 + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r10 + adcq $0,%r11 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r12,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq %rdx,%r10 + adcq $0,%r11 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + movq %r13,%rbp + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r8 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rsi) + addq %r12,%r10 + adcq $0,%rdx + xorq %r12,%r12 + addq %rax,%r10 + movq %r13,%rax + adcq %rdx,%r11 + adcq $0,%r12 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r13,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq %rdx,%r11 + adcq $0,%r12 + + + + + movq 16(%rsp),%rdi + subq 0(%rcx),%r14 + movq %r15,%rdx + sbbq 8(%rcx),%r15 + movq %r8,%rbx + sbbq 16(%rcx),%r8 + movq %r9,%rsi + sbbq 24(%rcx),%r9 + movq %r10,%rbp + sbbq 32(%rcx),%r10 + movq %r11,%r13 + sbbq 40(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rdx,%r15 + cmovcq %rbx,%r8 + movq %r14,0(%rdi) + cmovcq %rsi,%r9 + movq %r15,8(%rdi) + cmovcq %rbp,%r10 + movq %r8,16(%rdi) + cmovcq %r13,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_n_mul_mont_384 +.private_extern _sqr_n_mul_mont_384 + +.p2align 5 +_sqr_n_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 
+.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 8*17 + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +L$oop_sqr_384: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movd %xmm1,%edx + leaq 0(%rdi),%rsi + decl %edx + jnz L$oop_sqr_384 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*23 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqr_n_mul_mont_383 +.private_extern _sqr_n_mul_mont_383 + +.p2align 5 +_sqr_n_mul_mont_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 8*17 + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +L$oop_sqr_383: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + + movd %xmm1,%edx + addq 48(%rsi),%r14 + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + leaq 0(%rdi),%rsi + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + decl %edx + jnz L$oop_sqr_383 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*23 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_mont_383_nonred: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rbp + mulq %r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%r15 + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%r15 + movq %r8,%rax + adcq %rdx,%r15 + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r9 + 
adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rcx) + addq %r15,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %r15,%r13 + adcq %rdx,%r14 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + movq %r9,%r8 + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rsi) + addq %r15,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rcx) + addq %rax,%r8 + movq %r9,%rax + adcq %rdx,%r8 + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rcx) + addq %r8,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r8,%r14 + adcq %rdx,%r15 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r10,%r9 + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rcx) + addq %rax,%r9 + movq %r10,%rax + adcq %rdx,%r9 + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rcx) + addq %r9,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r9,%r15 + adcq %rdx,%r8 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r11,%r10 + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq 
%rbp,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rcx) + addq %rax,%r10 + movq %r11,%rax + adcq %rdx,%r10 + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rcx) + addq %r10,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r10,%r8 + adcq %rdx,%r9 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r12,%r11 + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rcx) + addq %rax,%r11 + movq %r12,%rax + adcq %rdx,%r11 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rcx) + addq %r11,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r11,%r9 + adcq %rdx,%r10 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r13,%r12 + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 0(%rcx) + addq %rax,%r12 + movq %r13,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 24(%rcx) + addq %r12,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rcx) + addq %rax,%r10 
+ movq %r14,%rax + adcq $0,%rdx + addq %r12,%r10 + adcq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_mont_382x +.private_extern _sqr_mont_382x + +.p2align 5 +_sqr_mont_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rsi,16(%rsp) + movq %rdi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq 24(%rsp),%rdi + call __mulq_mont_383_nonred + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %r8,64(%rdi) + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_383_nonred + movq 32+96(%rsp),%rsi + movq 32+0(%rsp),%r12 + movq 32+8(%rsp),%r13 + andq %rsi,%r12 + movq 32+16(%rsp),%rax + andq %rsi,%r13 + movq 32+24(%rsp),%rbx + andq %rsi,%rax + movq 32+32(%rsp),%rbp + andq %rsi,%rbx + andq %rsi,%rbp + andq 32+40(%rsp),%rsi + + subq %r12,%r14 + movq 0(%rcx),%r12 + sbbq %r13,%r15 + movq 8(%rcx),%r13 + sbbq %rax,%r8 + movq 16(%rcx),%rax + sbbq %rbx,%r9 + movq 24(%rcx),%rbx + sbbq %rbp,%r10 + movq 32(%rcx),%rbp + sbbq %rsi,%r11 + sbbq %rsi,%rsi + + andq %rsi,%r12 + andq %rsi,%r13 + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r12,%r14 + adcq %r13,%r15 + adcq %rax,%r8 + adcq %rbx,%r9 + adcq %rbp,%r10 + adcq %rsi,%r11 + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/mulx_mont_256-x86_64.s b/crypto/blst_src/build/mach-o/mulx_mont_256-x86_64.s new file mode 100644 index 00000000000..178372f41b2 --- /dev/null +++ b/crypto/blst_src/build/mach-o/mulx_mont_256-x86_64.s @@ -0,0 +1,619 @@ 
+.text + +.globl _mulx_mont_sparse_256 +.private_extern _mulx_mont_sparse_256 + +.p2align 5 +_mulx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqrx_mont_sparse_256 +.private_extern _sqrx_mont_sparse_256 + +.p2align 5 +_sqrx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + movq %rcx,%r8 + movq %rdx,%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + mulxq %r15,%r15,%r12 + mulxq %rbp,%rbp,%r13 + addq %r15,%r11 + mulxq %r9,%r9,%r14 + movq 8(%rbx),%rdx + adcq %rbp,%r12 + adcq %r9,%r13 + adcq $0,%r14 + + movq %rax,%r10 + imulq %r8,%rax + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r11 + adcxq %r9,%r12 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r14 + adcxq %r15,%r9 + adoxq %r9,%r15 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r10 + adoxq %r11,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r12 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r12 + adoxq %r9,%r13 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 16(%rbx),%rdx + adcxq %rbp,%r13 + adoxq %r9,%r14 + adcxq %r10,%r14 + adoxq %r10,%r15 + adcxq %r10,%r15 + adoxq %r10,%r10 + adcq $0,%r10 + movq %rax,%r11 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r15 + adcxq %r10,%r9 + 
adoxq %r9,%r10 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r11 + adoxq %r12,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r13 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r13 + adoxq %r9,%r14 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 24(%rbx),%rdx + adcxq %rbp,%r14 + adoxq %r9,%r15 + adcxq %r11,%r15 + adoxq %r11,%r10 + adcxq %r11,%r10 + adoxq %r11,%r11 + adcq $0,%r11 + movq %rax,%r12 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r15 + adcxq %r9,%r10 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r10 + adcxq %r11,%r9 + adoxq %r9,%r11 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r12 + adoxq %r13,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r14 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %rax,%rdx + adcxq %rbp,%r15 + adoxq %r9,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + adoxq %r12,%r12 + adcq $0,%r12 + imulq %r8,%rdx + + + xorq %rbp,%rbp + mulxq 0+128(%rcx),%r13,%r9 + adcxq %rax,%r13 + adoxq %r9,%r14 + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r15 + adoxq %r9,%r10 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %r14,%rdx + leaq 128(%rcx),%rcx + adcxq %rbp,%r10 + adoxq %r9,%r11 + movq %r15,%rax + adcxq %r13,%r11 + adoxq %r13,%r12 + adcq $0,%r12 + + + + + movq %r10,%rbp + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + sbbq 16(%rcx),%r10 + movq %r11,%r9 + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rdx,%r14 + cmovcq %rax,%r15 + cmovcq %rbp,%r10 + movq %r14,0(%rdi) + cmovcq %r9,%r11 + movq %r15,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _fromx_mont_256 +.private_extern _fromx_mont_256 + +.p2align 5 +_fromx_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + + + + + movq %r15,%rdx + movq %r10,%r12 + movq %r11,%r13 + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + sbbq 24(%rbx),%r11 + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _redcx_mont_256 +.private_extern _redcx_mont_256 + +.p2align 5 +_redcx_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + addq 32(%rsi),%r14 + adcq 40(%rsi),%r15 + movq %r14,%rax + adcq 48(%rsi),%r10 + movq %r15,%rdx + adcq 56(%rsi),%r11 + sbbq %rsi,%rsi + + + + + movq %r10,%r12 + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + movq %r11,%r13 + sbbq 24(%rbx),%r11 + sbbq $0,%rsi + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_by_1_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r11 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r10 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r10 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + movq %r13,%r11 + imulq %rcx,%r13 + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/mulx_mont_384-x86_64.s b/crypto/blst_src/build/mach-o/mulx_mont_384-x86_64.s new file mode 100644 index 00000000000..95d3dadcc67 --- /dev/null +++ b/crypto/blst_src/build/mach-o/mulx_mont_384-x86_64.s @@ -0,0 +1,2960 @@ +.text + + + + + + + + +.p2align 5 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + 
sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mulx_mont_384x +.private_extern _mulx_mont_384x + +.p2align 5 +_mulx_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $328,%rsp +.cfi_adjust_cfa_offset 328 + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulx_384 + + + leaq 48(%rbx),%rbx + leaq 128+48(%rsi),%rsi + leaq 96(%rdi),%rdi + call __mulx_384 + + + movq 8(%rsp),%rcx + leaq (%rbx),%rsi + leaq -48(%rbx),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + 
call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulx_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + leaq (%rcx),%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -328-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_mont_384x +.private_extern _sqrx_mont_384x + +.p2align 5 +_sqrx_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + movq %rdx,%r8 + adcq %r12,%r12 + movq %r15,%r9 + adcq %rdi,%rdi + movq %rax,%r10 + adcq %rbp,%rbp + movq %r12,%r11 + sbbq %rsi,%rsi + + subq 0(%rcx),%rdx + sbbq 8(%rcx),%r15 + movq %rdi,%r13 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r12 + sbbq 32(%rcx),%rdi + movq %rbp,%r14 + sbbq 40(%rcx),%rbp + sbbq $0,%rsi + + cmovcq %r8,%rdx + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %rdx,48(%rbx) + cmovcq %r11,%r12 + movq %r15,56(%rbx) + cmovcq %r13,%rdi + movq %rax,64(%rbx) + cmovcq %r14,%rbp + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _mulx_382x +.private_extern _mulx_382x + +.p2align 5 +_mulx_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 
8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulx_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulx_384 + + + leaq 48+128(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulx_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_382x +.private_extern _sqrx_382x + +.p2align 5 +_sqrx_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulx_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulx_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + 
adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mulx_384 +.private_extern _mulx_384 + +.p2align 5 +_mulx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + + movq %rdx,%rbx + call __mulx_384 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__mulx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + leaq -128(%rsi),%rsi + + mulxq %r14,%r9,%rcx + xorq %rbp,%rbp + + mulxq %r15,%r8,%rax + adcxq %rcx,%r8 + movq %r9,0(%rdi) + + mulxq %r10,%r9,%rcx + adcxq %rax,%r9 + + mulxq %r11,%r10,%rax + adcxq %rcx,%r10 + + mulxq %r12,%r11,%rcx + adcxq %rax,%r11 + + mulxq %r13,%r12,%r13 + movq 8(%rbx),%rdx + adcxq %rcx,%r12 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,8(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 16(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,16(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 24(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,24(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 32(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq 
%rcx,%r9 + movq %rax,32(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 40(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,40(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq %rax,%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + movq %r10,64(%rdi) + movq %r11,72(%rdi) + movq %r12,80(%rdi) + movq %r13,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_384 +.private_extern _sqrx_384 + +.p2align 5 +_sqrx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + call __sqrx_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__sqrx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%rcx + movq 32(%rsi),%rbx + + + mulxq %r14,%r8,%rdi + movq 40(%rsi),%rbp + mulxq %r15,%r9,%rax + addq %rdi,%r9 + mulxq %rcx,%r10,%rdi + adcq %rax,%r10 + mulxq %rbx,%r11,%rax + adcq %rdi,%r11 + mulxq %rbp,%r12,%r13 + movq %r14,%rdx + adcq %rax,%r12 + adcq $0,%r13 + + + xorq %r14,%r14 + mulxq %r15,%rdi,%rax + adcxq %rdi,%r10 + adoxq %rax,%r11 + + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r11 + adoxq %rax,%r12 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbp,%rdi,%rax + movq %r15,%rdx + adcxq %rdi,%r13 + adoxq %r14,%rax + adcxq %rax,%r14 + + + xorq %r15,%r15 + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r13 + adoxq %rax,%r14 + + mulxq %rbp,%rdi,%rax + movq %rcx,%rdx + adcxq %rdi,%r14 + adoxq %r15,%rax + adcxq %rax,%r15 + + + xorq %rcx,%rcx + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r14 + adoxq %rax,%r15 + + mulxq %rbp,%rdi,%rax + movq %rbx,%rdx + adcxq %rdi,%r15 + adoxq %rcx,%rax + adcxq %rax,%rcx + + + mulxq %rbp,%rdi,%rbx + movq 0(%rsi),%rdx + addq %rdi,%rcx + movq 8(%rsp),%rdi + adcq $0,%rbx + + + xorq %rbp,%rbp + adcxq %r8,%r8 + adcxq %r9,%r9 + adcxq %r10,%r10 + adcxq %r11,%r11 + adcxq %r12,%r12 + + + mulxq %rdx,%rdx,%rax + movq %rdx,0(%rdi) + movq 8(%rsi),%rdx + adoxq %rax,%r8 + movq %r8,8(%rdi) + + mulxq %rdx,%r8,%rax + movq 16(%rsi),%rdx + adoxq %r8,%r9 + adoxq %rax,%r10 + movq %r9,16(%rdi) + movq %r10,24(%rdi) + + mulxq %rdx,%r8,%r9 + movq 24(%rsi),%rdx + adoxq %r8,%r11 + adoxq %r9,%r12 + adcxq %r13,%r13 + adcxq %r14,%r14 + movq 
%r11,32(%rdi) + movq %r12,40(%rdi) + + mulxq %rdx,%r8,%r9 + movq 32(%rsi),%rdx + adoxq %r8,%r13 + adoxq %r9,%r14 + adcxq %r15,%r15 + adcxq %rcx,%rcx + movq %r13,48(%rdi) + movq %r14,56(%rdi) + + mulxq %rdx,%r8,%r9 + movq 40(%rsi),%rdx + adoxq %r8,%r15 + adoxq %r9,%rcx + adcxq %rbx,%rbx + adcxq %rbp,%rbp + movq %r15,64(%rdi) + movq %rcx,72(%rdi) + + mulxq %rdx,%r8,%r9 + adoxq %r8,%rbx + adoxq %r9,%rbp + + movq %rbx,80(%rdi) + movq %rbp,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + + +.globl _redcx_mont_384 +.private_extern _redcx_mont_384 + +.p2align 5 +_redcx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + + + +.globl _fromx_mont_384 +.private_extern _fromx_mont_384 + +.p2align 5 +_fromx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + + + + + movq %r14,%rax + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_by_1_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq %rcx,%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + imulq %r8,%rdx + + + xorq %r14,%r14 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r13 + adoxq %r14,%rbp + adcxq %rbp,%r14 + imulq %r9,%rdx + + + xorq %r15,%r15 + 
mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r14 + adoxq %r15,%rbp + adcxq %rbp,%r15 + imulq %r10,%rdx + + + xorq %r8,%r8 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r15 + adoxq %r8,%rbp + adcxq %rbp,%r8 + imulq %r11,%rdx + + + xorq %r9,%r9 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r8 + adoxq %r9,%rbp + adcxq %rbp,%r9 + imulq %r12,%rdx + + + xorq %r10,%r10 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r9 + adoxq %r10,%rbp + adcxq %rbp,%r10 + imulq %r13,%rdx + + + xorq %r11,%r11 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r10 + adoxq %r11,%rbp + adcxq %rbp,%r11 + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__redc_tail_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0x_pty_mont_384 +.private_extern _sgn0x_pty_mont_384 + +.p2align 5 +_sgn0x_pty_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp 
+.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0x_pty_mont_384x +.private_extern _sgn0x_pty_mont_384x + +.p2align 5 +_sgn0x_pty_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mulx_mont_384 +.private_extern _mulx_mont_384 + +.p2align 5 +_mulx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 
40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + movq %r8,(%rsp) + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + xorq %r15,%r15 + + movq %r8,16(%rsp) + imulq 8(%rsp),%r8 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %rbp,%r15 + adoxq %rax,%r15 + adoxq %rax,%rax + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %r8,%r14 + adoxq %r8,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r9,16(%rsp) + imulq 8(%rsp),%r9 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rbp,%rax + adoxq %r8,%rax + adoxq %r8,%r8 + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r9,%r15 + adoxq %r9,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r10,16(%rsp) + imulq 8(%rsp),%r10 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %rbp,%r8 + adoxq %r9,%r8 + adoxq %r9,%r9 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq 
%rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r10,%rax + adoxq %r10,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r11,16(%rsp) + imulq 8(%rsp),%r11 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %rbp,%r9 + adoxq %r10,%r9 + adoxq %r10,%r10 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r11,%r8 + adoxq %r11,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + movq %r12,16(%rsp) + imulq 8(%rsp),%r12 + + + xorq %r11,%r11 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %rbp,%r10 + adoxq %r11,%r10 + adoxq %r11,%r11 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r12,%r9 + adoxq %r12,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + movq %r15,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + movq %rax,%rsi + + mulxq 40+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + movq %r14,%rdx + adcxq %r12,%r10 + adoxq %r12,%r11 + leaq 128(%rcx),%rcx + movq %r8,%r12 + adcq $0,%r11 + + + + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r9,%rdi + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r8 + sbbq 32(%rcx),%r9 + movq %r10,%rbp + sbbq 40(%rcx),%r10 + sbbq $0,%r11 + + cmovncq %r14,%rdx + cmovcq %r13,%r15 + cmovcq %rsi,%rax + cmovncq %r8,%r12 + movq %rdx,0(%rbx) + cmovncq %r9,%rdi + movq %r15,8(%rbx) + cmovncq %r10,%rbp + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq 
%rbp,40(%rbx) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_mont_384 +.private_extern _sqrx_mont_384 + +.p2align 5 +_sqrx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rcx,%r8 + leaq -128(%rdx),%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + leaq (%rsi),%rbx + movq %r8,(%rsp) + leaq -128(%rsi),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqrx_n_mul_mont_384 +.private_extern _sqrx_n_mul_mont_384 + +.p2align 5 +_sqrx_n_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -40(%rsp),%rsp +.cfi_adjust_cfa_offset 8*5 + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + +L$oop_sqrx_384: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movd %xmm1,%r10d + decl %r10d + jnz L$oop_sqrx_384 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 +.cfi_restore %r15 + movq 48(%rsp),%r14 +.cfi_restore %r14 + movq 56(%rsp),%r13 +.cfi_restore %r13 + movq 64(%rsp),%r12 +.cfi_restore %r12 + movq 72(%rsp),%rbx +.cfi_restore %rbx + movq 80(%rsp),%rbp +.cfi_restore %rbp + leaq 88(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqrx_n_mul_mont_383 +.private_extern _sqrx_n_mul_mont_383 + +.p2align 5 +_sqrx_n_mul_mont_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -40(%rsp),%rsp +.cfi_adjust_cfa_offset 8*5 + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + leaq -128(%rcx),%rcx + 
+L$oop_sqrx_383: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_383_nonred + + movd %xmm1,%r10d + decl %r10d + jnz L$oop_sqrx_383 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 +.cfi_restore %r15 + movq 48(%rsp),%r14 +.cfi_restore %r14 + movq 56(%rsp),%r13 +.cfi_restore %r13 + movq 64(%rsp),%r12 +.cfi_restore %r12 + movq 72(%rsp),%rbx +.cfi_restore %rbx + movq 80(%rsp),%rbp +.cfi_restore %rbp + leaq 88(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_mont_383_nonred: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + movq %r8,%rax + imulq 8(%rsp),%r8 + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %r15,%rbp + adoxq %rbp,%r15 + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %rax,%r14 + adoxq %rax,%r15 + adcxq %rax,%r15 + movq %r9,%r8 + imulq 8(%rsp),%r9 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rax,%rbp + adoxq %rbp,%rax + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r10,%r9 + imulq 8(%rsp),%r10 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %r8,%rbp + adoxq %rbp,%r8 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r11 + + mulxq 
8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r11,%r10 + imulq 8(%rsp),%r11 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %r9,%rbp + adoxq %rbp,%r9 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r12,%r11 + imulq 8(%rsp),%r12 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %r10,%rbp + adoxq %rbp,%r10 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r14,%rdx + adcxq %rdi,%r9 + adoxq %rbp,%r10 + adcq $0,%r10 + movq %r8,%r12 + + movq %r14,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r9,%rdi + movq %r8,24(%rbx) + movq %r9,32(%rbx) + movq %r10,40(%rbx) + movq %r10,%rbp + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_mont_382x +.private_extern _sqrx_mont_382x + +.p2align 5 +_sqrx_mont_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 
+ pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + adcq %r12,%r12 + adcq %rdi,%rdi + adcq %rbp,%rbp + + movq %rdx,48(%rbx) + movq %r15,56(%rbx) + movq %rax,64(%rbx) + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32-128(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + + + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + movq 32+96(%rsp),%r14 + leaq 128(%rcx),%rcx + movq 32+0(%rsp),%r8 + andq %r14,%r8 + movq 32+8(%rsp),%r9 + andq %r14,%r9 + movq 32+16(%rsp),%r10 + andq %r14,%r10 + movq 32+24(%rsp),%r11 + andq %r14,%r11 + movq 32+32(%rsp),%r13 + andq %r14,%r13 + andq 32+40(%rsp),%r14 + + subq %r8,%rdx + movq 0(%rcx),%r8 + sbbq %r9,%r15 + movq 8(%rcx),%r9 + sbbq %r10,%rax + movq 16(%rcx),%r10 + sbbq %r11,%r12 + movq 24(%rcx),%r11 + sbbq %r13,%rdi + movq 32(%rcx),%r13 + sbbq %r14,%rbp + sbbq %r14,%r14 + + andq %r14,%r8 + andq %r14,%r9 + andq %r14,%r10 + andq %r14,%r11 + andq %r14,%r13 + andq 40(%rcx),%r14 + + addq %r8,%rdx + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%r12 + adcq %r13,%rdi + adcq %r14,%rbp + + movq %rdx,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/sha256-armv8.S b/crypto/blst_src/build/mach-o/sha256-armv8.S new file mode 100644 index 00000000000..c928f75025f --- /dev/null +++ b/crypto/blst_src/build/mach-o/sha256-armv8.S @@ -0,0 +1,1077 @@ +// +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// ==================================================================== +// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +// project. 
+// ==================================================================== +// +// sha256_block procedure for ARMv8. +// +// This module is stripped of scalar code paths, with raionale that all +// known processors are NEON-capable. +// +// See original module at CRYPTOGAMS for further details. + +.text + +.align 6 + +LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.align 2 +.align 2 +.globl _blst_sha256_block_armv8 + +.align 6 +_blst_sha256_block_armv8: +Lv8_entry: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adr x3,LK256 + +Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 
0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret + +.globl _blst_sha256_block_data_order + +.align 4 +_blst_sha256_block_data_order: + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + sub sp,sp,#16*4 + + adr x16,LK256 + add x2,x1,x2,lsl#6 // len to point at the end of inp + + ld1 {v0.16b},[x1], #16 + ld1 {v1.16b},[x1], #16 + ld1 {v2.16b},[x1], #16 + ld1 {v3.16b},[x1], #16 + ld1 {v4.4s},[x16], #16 + ld1 {v5.4s},[x16], #16 + ld1 {v6.4s},[x16], #16 + ld1 {v7.4s},[x16], #16 + rev32 v0.16b,v0.16b // yes, even on + rev32 v1.16b,v1.16b // big-endian + rev32 v2.16b,v2.16b + rev32 v3.16b,v3.16b + mov x17,sp + add v4.4s,v4.4s,v0.4s + add v5.4s,v5.4s,v1.4s + add v6.4s,v6.4s,v2.4s + st1 {v4.4s,v5.4s},[x17], #32 + add v7.4s,v7.4s,v3.4s + st1 {v6.4s,v7.4s},[x17] + sub x17,x17,#32 + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#8] + ldp w7,w8,[x0,#16] + ldp w9,w10,[x0,#24] + ldr w12,[sp,#0] + mov w13,wzr + eor w14,w4,w5 + mov w15,wzr + b L_00_48 + +.align 4 +L_00_48: + ext v4.16b,v0.16b,v1.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v2.16b,v3.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v3.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v0.4s,v0.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v0.4s,v0.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v0.4s,v0.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v0.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v0.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v0.4s,#15 + add w8,w8,w12 + ushr v17.4s,v0.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v0.4s,#13 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v0.4s,v0.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v0.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v1.16b,v2.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v3.16b,v0.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v0.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v1.4s,v1.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli 
v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v1.4s,v1.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v1.4s,v1.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v1.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v1.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v1.4s,#15 + add w4,w4,w12 + ushr v17.4s,v1.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v1.4s,#13 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v1.4s,v1.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v1.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + ext v4.16b,v2.16b,v3.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v0.16b,v1.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v1.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v2.4s,v2.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v2.4s,v2.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v2.4s,v2.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v2.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v2.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v2.4s,#15 + add w8,w8,w12 + ushr v17.4s,v2.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v2.4s,#13 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + 
add v2.4s,v2.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v2.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v3.16b,v0.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v1.16b,v2.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v2.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v3.4s,v3.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v3.4s,v3.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v3.4s,v3.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v3.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v3.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v3.4s,#15 + add w4,w4,w12 + ushr v17.4s,v3.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v3.4s,#13 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v3.4s,v3.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v3.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[x16] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + cmp w12,#0 // check for K256 terminator + ldr w12,[sp,#0] + sub x17,x17,#64 + bne L_00_48 + + sub x16,x16,#256 // rewind x16 + cmp x1,x2 + mov x17, #64 + csel x17, x17, xzr, eq + sub x1,x1,x17 // avoid SEGV + mov x17,sp + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v0.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v0.16b,v0.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v0.4s + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add 
w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v1.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v1.16b,v1.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v1.4s + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v2.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v2.16b,v2.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v2.4s + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor 
w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v3.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v3.16b,v3.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v3.4s + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w3,w3,w15 // h+=Sigma0(a) from the past + ldp w11,w12,[x0,#0] + add w3,w3,w13 // h+=Maj(a,b,c) from the past + ldp w13,w14,[x0,#8] + add w3,w3,w11 // accumulate + add w4,w4,w12 + ldp w11,w12,[x0,#16] + add w5,w5,w13 + add w6,w6,w14 + ldp w13,w14,[x0,#24] + add w7,w7,w11 + add w8,w8,w12 + ldr w12,[sp,#0] + stp w3,w4,[x0,#0] + add w9,w9,w13 + mov w13,wzr + stp w5,w6,[x0,#8] + add w10,w10,w14 + stp w7,w8,[x0,#16] + eor w14,w4,w5 + stp w9,w10,[x0,#24] + mov w15,wzr + mov x17,sp + b.ne L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret + +.globl _blst_sha256_emit +.private_extern _blst_sha256_emit + +.align 4 +_blst_sha256_emit: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] +#ifndef __AARCH64EB__ + rev x4,x4 + rev x5,x5 + rev x6,x6 + rev x7,x7 +#endif + str w4,[x0,#4] + lsr x4,x4,#32 + str w5,[x0,#12] + lsr x5,x5,#32 + str w6,[x0,#20] + lsr x6,x6,#32 + str w7,[x0,#28] + lsr x7,x7,#32 + str w4,[x0,#0] + str w5,[x0,#8] + str w6,[x0,#16] + str w7,[x0,#24] + ret + + +.globl _blst_sha256_bcopy +.private_extern _blst_sha256_bcopy + +.align 4 +_blst_sha256_bcopy: +Loop_bcopy: + ldrb w3,[x1],#1 + sub x2,x2,#1 + strb w3,[x0],#1 + cbnz x2,Loop_bcopy + ret + + +.globl _blst_sha256_hcopy +.private_extern _blst_sha256_hcopy + +.align 4 +_blst_sha256_hcopy: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + ret + diff --git a/crypto/blst_src/build/mach-o/sha256-portable-x86_64.s b/crypto/blst_src/build/mach-o/sha256-portable-x86_64.s new file mode 100644 index 00000000000..3f000720d00 --- /dev/null +++ b/crypto/blst_src/build/mach-o/sha256-portable-x86_64.s @@ -0,0 +1,1746 @@ +.text + +.globl _blst_sha256_block_data_order + +.p2align 4 +_blst_sha256_block_data_order: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset 
%r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $64+24,%rsp +.cfi_adjust_cfa_offset 16*4+3*8 + leaq (%rsi,%rdx,4),%rdx + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp L$loop + +.p2align 4 +L$loop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 0(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 4(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 4(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 8(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 12(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 16(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 20(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + 
bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 20(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 24(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 28(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + addl %r14d,%eax + movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 32(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 36(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 36(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 40(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi 
+ + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 44(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 48(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 52(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 56(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 60(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + jmp L$rounds_16_xx +.p2align 4 +L$rounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + + addl 0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 64(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl 
%edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 8(%rsp),%r13d + movl 60(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 68(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 72(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 16(%rsp),%r13d + movl 4(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 76(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + + addl 16(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 80(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 24(%rsp),%r13d + movl 12(%rsp),%edi + + movl %r13d,%r12d + 
rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + + addl 20(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 84(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 88(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 92(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + movl 36(%rsp),%r13d + movl 24(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 96(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 40(%rsp),%r13d + movl 28(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + 
rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 100(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 104(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 108(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 112(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi 
+ addl 24(%rsp),%r12d + + addl 52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 116(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d + + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 120(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d + + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 124(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + leaq 64(%rbp),%rbp + cmpb $0x19,3(%rbp) + jnz L$rounds_16_xx + + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb L$loop + + leaq 64+24+48(%rsp),%r11 +.cfi_def_cfa %r11,8 + movq 64+24(%rsp),%r15 +.cfi_restore %r15 + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbp +.cfi_restore %rbp + movq -8(%r11),%rbx +.cfi_restore %rbx + + leaq (%r11),%rsp + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 6 + +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 
0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl _blst_sha256_emit +.private_extern _blst_sha256_emit + +.p2align 4 +_blst_sha256_emit: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + bswapq %r8 + movq 24(%rsi),%r11 + bswapq %r9 + movl %r8d,4(%rdi) + bswapq %r10 + movl %r9d,12(%rdi) + bswapq %r11 + movl %r10d,20(%rdi) + shrq $32,%r8 + movl %r11d,28(%rdi) + shrq $32,%r9 + movl %r8d,0(%rdi) + shrq $32,%r10 + movl %r9d,8(%rdi) + shrq $32,%r11 + movl %r10d,16(%rdi) + movl %r11d,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _blst_sha256_bcopy +.private_extern _blst_sha256_bcopy + +.p2align 4 +_blst_sha256_bcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rsi,%rdi +L$oop_bcopy: + movzbl (%rsi),%eax + leaq 1(%rsi),%rsi + movb %al,-1(%rdi,%rsi,1) + decq %rdx + jnz L$oop_bcopy + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _blst_sha256_hcopy +.private_extern _blst_sha256_hcopy + +.p2align 4 +_blst_sha256_hcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/sha256-x86_64.s b/crypto/blst_src/build/mach-o/sha256-x86_64.s new file mode 100644 index 00000000000..dee75e35362 --- /dev/null +++ b/crypto/blst_src/build/mach-o/sha256-x86_64.s @@ -0,0 +1,1438 @@ +.text + +.p2align 6 + +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl _blst_sha256_block_data_order_shaext +.private_extern _blst_sha256_block_data_order_shaext + +.p2align 6 +_blst_sha256_block_data_order_shaext: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 
16(%rdi),%xmm2 + movdqa 256-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp L$oop_shaext + +.p2align 4 +L$oop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 16-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 48-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 64-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 80-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 96-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 112-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 144-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 176-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 208-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 224-128(%rcx),%xmm0 + paddd 
%xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 240-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz L$oop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _blst_sha256_block_data_order +.private_extern _blst_sha256_block_data_order + +.p2align 6 +_blst_sha256_block_data_order: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $40,%rsp +.cfi_adjust_cfa_offset 40 + leaq (%rsi,%rdx,4),%rdx + movq %rdi,0(%rsp) + + movq %rdx,16(%rsp) + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + + + leaq -64(%rsp),%rsp + movl 0(%rdi),%eax + andq $-64,%rsp + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp L$loop_ssse3 +.p2align 4 +L$loop_ssse3: + movdqa K256+256(%rip),%xmm7 + movq %rsi,8(%rbp) + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rsi +.byte 102,15,56,0,207 + movdqa 0(%rsi),%xmm4 + movdqa 16(%rsi),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 32(%rsi),%xmm6 +.byte 102,15,56,0,223 + movdqa 48(%rsi),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp L$ssse3_00_47 + +.p2align 4 +L$ssse3_00_47: + subq $-64,%rsi + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor 
%xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor 
%xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 16(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 32(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl 
%r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 48(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,67(%rsi) + jne L$ssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl 
%r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d 
+ xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 0(%rbp),%rdi + movl %r14d,%eax + movq 8(%rbp),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + leaq 64(%rsi),%rsi + cmpq 16(%rbp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb L$loop_ssse3 + + xorps %xmm0,%xmm0 + leaq 40+48(%rbp),%r11 +.cfi_def_cfa %r11,8 + movaps %xmm0,0(%rsp) + movaps %xmm0,16(%rsp) + movaps %xmm0,32(%rsp) + movaps %xmm0,48(%rsp) + movq 40(%rbp),%r15 +.cfi_restore %r15 + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbx +.cfi_restore %rbx + movq -8(%r11),%rbp +.cfi_restore %rbp + + leaq (%r11),%rsp + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _blst_sha256_emit +.private_extern _blst_sha256_emit + +.p2align 4 +_blst_sha256_emit: +.cfi_startproc + 
.byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + bswapq %r8 + movq 24(%rsi),%r11 + bswapq %r9 + movl %r8d,4(%rdi) + bswapq %r10 + movl %r9d,12(%rdi) + bswapq %r11 + movl %r10d,20(%rdi) + shrq $32,%r8 + movl %r11d,28(%rdi) + shrq $32,%r9 + movl %r8d,0(%rdi) + shrq $32,%r10 + movl %r9d,8(%rdi) + shrq $32,%r11 + movl %r10d,16(%rdi) + movl %r11d,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _blst_sha256_bcopy +.private_extern _blst_sha256_bcopy + +.p2align 4 +_blst_sha256_bcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rsi,%rdi +L$oop_bcopy: + movzbl (%rsi),%eax + leaq 1(%rsi),%rsi + movb %al,-1(%rdi,%rsi,1) + decq %rdx + jnz L$oop_bcopy + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _blst_sha256_hcopy +.private_extern _blst_sha256_hcopy + +.p2align 4 +_blst_sha256_hcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/refresh.sh b/crypto/blst_src/build/refresh.sh new file mode 100755 index 00000000000..e8c8137c287 --- /dev/null +++ b/crypto/blst_src/build/refresh.sh @@ -0,0 +1,49 @@ +#!/bin/sh + +HERE=`dirname $0` +cd "${HERE}" + +PERL=${PERL:-perl} + +for pl in ../src/asm/*-x86_64.pl; do + s=`basename $pl .pl`.asm + expr $s : '.*portable' > /dev/null || (set -x; ${PERL} $pl masm > win64/$s) + s=`basename $pl .pl`.s + (set -x; ${PERL} $pl elf > elf/$s) + (set -x; ${PERL} $pl mingw64 > coff/$s) + (set -x; ${PERL} $pl macosx > mach-o/$s) +done + +for pl in ../src/asm/*-armv8.pl; do + s=`basename $pl .pl`.asm + (set -x; ${PERL} $pl win64 > win64/$s) + s=`basename $pl .pl`.S + (set -x; ${PERL} $pl linux64 > elf/$s) + (set -x; ${PERL} $pl coff64 > coff/$s) + (set -x; ${PERL} $pl ios64 > mach-o/$s) +done + +( cd ../bindings; + echo "LIBRARY blst" + echo + echo "EXPORTS" + cc -E blst.h | \ + ${PERL} -ne '{ (/(blst_[\w]+)\s*\(/ || /(BLS12_[\w]+);/) && print "\t$1\n" }' + echo +) > win64/blst.def + +if which bindgen > /dev/null 2>&1; then + ( cd ../bindings; set -x; + bindgen --opaque-type blst_pairing \ + --opaque-type blst_uniq \ + --with-derive-default \ + --with-derive-eq \ + --size_t-is-usize \ + --rustified-enum BLST.\* \ + blst.h -- -D__BLST_RUST_BINDGEN__ \ + | ${PERL} ../build/bindings_trim.pl > rust/src/bindings.rs + ) +else + echo "Install Rust bindgen with 'cargo install bindgen'" 1>&2 + exit 1 +fi diff --git a/crypto/blst_src/build/win64/add_mod_256-armv8.asm b/crypto/blst_src/build/win64/add_mod_256-armv8.asm new file mode 100644 index 00000000000..8d6975185a6 --- /dev/null +++ b/crypto/blst_src/build/win64/add_mod_256-armv8.asm @@ -0,0 +1,380 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + + EXPORT |add_mod_256|[FUNC] + ALIGN 32 +|add_mod_256| PROC + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + adds x8,x8,x12 + ldp x14,x15,[x2,#16] + adcs x9,x9,x13 + ldp x4,x5,[x3] + adcs x10,x10,x14 + ldp x6,x7,[x3,#16] + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csello x8,x8,x16 + csello x9,x9,x17 + csello x10,x10,x1 + stp x8,x9,[x0] + csello x11,x11,x2 + stp x10,x11,[x0,#16] + + ret + ENDP + + + + EXPORT |mul_by_3_mod_256|[FUNC] + ALIGN 32 +|mul_by_3_mod_256| PROC + ldp x12,x13,[x1] + ldp x14,x15,[x1,#16] + + adds x8,x12,x12 + ldp x4,x5,[x2] + adcs x9,x13,x13 + ldp x6,x7,[x2,#16] + adcs x10,x14,x14 + adcs x11,x15,x15 + adc 
x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csello x8,x8,x16 + csello x9,x9,x17 + csello x10,x10,x1 + csello x11,x11,x2 + + adds x8,x8,x12 + adcs x9,x9,x13 + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csello x8,x8,x16 + csello x9,x9,x17 + csello x10,x10,x1 + stp x8,x9,[x0] + csello x11,x11,x2 + stp x10,x11,[x0,#16] + + ret + ENDP + + + + EXPORT |lshift_mod_256|[FUNC] + ALIGN 32 +|lshift_mod_256| PROC + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +|$Loop_lshift_mod_256| + adds x8,x8,x8 + sub x2,x2,#1 + adcs x9,x9,x9 + adcs x10,x10,x10 + adcs x11,x11,x11 + adc x3,xzr,xzr + + subs x12,x8,x4 + sbcs x13,x9,x5 + sbcs x14,x10,x6 + sbcs x15,x11,x7 + sbcs xzr,x3,xzr + + csello x8,x8,x12 + csello x9,x9,x13 + csello x10,x10,x14 + csello x11,x11,x15 + + cbnz x2,|$Loop_lshift_mod_256| + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret + ENDP + + + + EXPORT |rshift_mod_256|[FUNC] + ALIGN 32 +|rshift_mod_256| PROC + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +|$Loop_rshift| + adds x12,x8,x4 + sub x2,x2,#1 + adcs x13,x9,x5 + adcs x14,x10,x6 + adcs x15,x11,x7 + adc x3,xzr,xzr + tst x8,#1 + + cselne x12,x12,x8 + cselne x13,x13,x9 + cselne x14,x14,x10 + cselne x15,x15,x11 + cselne x3,x3,xzr + + extr x8,x13,x12,#1 + extr x9,x14,x13,#1 + extr x10,x15,x14,#1 + extr x11,x3,x15,#1 + + cbnz x2,|$Loop_rshift| + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret + ENDP + + + + EXPORT |cneg_mod_256|[FUNC] + ALIGN 32 +|cneg_mod_256| PROC + ldp x8,x9,[x1] + ldp x4,x5,[x3] + + ldp x10,x11,[x1,#16] + subs x12,x4,x8 + ldp x6,x7,[x3,#16] + orr x4,x8,x9 + sbcs x13,x5,x9 + orr x5,x10,x11 + sbcs x14,x6,x10 + orr x3,x4,x5 + sbc x15,x7,x11 + + cmp x3,#0 + csetmne x3 + ands x2,x2,x3 + + cseleq x8,x8,x12 + cseleq x9,x9,x13 + cseleq x10,x10,x14 + stp x8,x9,[x0] + cseleq x11,x11,x15 + stp x10,x11,[x0,#16] + + ret + ENDP + + + + EXPORT |sub_mod_256|[FUNC] + ALIGN 32 +|sub_mod_256| PROC + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + subs x8,x8,x12 + ldp x14,x15,[x2,#16] + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + stp x8,x9,[x0] + adc x11,x11,x7 + stp x10,x11,[x0,#16] + + ret + ENDP + + + + EXPORT |check_mod_256|[FUNC] + ALIGN 32 +|check_mod_256| PROC + ldp x8,x9,[x0] + ldp x10,x11,[x0,#16] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + subs xzr,x8,x4 + sbcs xzr,x9,x5 + orr x8,x8,x9 + sbcs xzr,x10,x6 + orr x8,x8,x10 + sbcs xzr,x11,x7 + orr x8,x8,x11 + sbc x1,xzr,xzr + + cmp x8,#0 + mov x0,#1 + cselne x0,x0,xzr + and x0,x0,x1 + + ret + ENDP + + + + EXPORT |add_n_check_mod_256|[FUNC] + ALIGN 32 +|add_n_check_mod_256| PROC + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + adds x8,x8,x12 + ldp x4,x5,[x3] + adcs x9,x9,x13 + ldp x6,x7,[x3,#16] + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csello x8,x8,x16 + csello x9,x9,x17 + csello x10,x10,x1 + csello x11,x11,x2 + + orr 
x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + cselne x0,x17,xzr + + ret + ENDP + + + + EXPORT |sub_n_check_mod_256|[FUNC] + ALIGN 32 +|sub_n_check_mod_256| PROC + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + subs x8,x8,x12 + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + adc x11,x11,x7 + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + cselne x0,x17,xzr + + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/add_mod_256-x86_64.asm b/crypto/blst_src/build/win64/add_mod_256-x86_64.asm new file mode 100644 index 00000000000..09a5c17975d --- /dev/null +++ b/crypto/blst_src/build/win64/add_mod_256-x86_64.asm @@ -0,0 +1,934 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC add_mod_256 + + +ALIGN 32 +add_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_add_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + sub rsp,8 + +$L$SEH_body_add_mod_256:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + +$L$oaded_a_add_mod_256:: + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + mov rax,r8 + adc r10,QWORD PTR[16+rdx] + mov rsi,r9 + adc r11,QWORD PTR[24+rdx] + sbb rdx,rdx + + mov rbx,r10 + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rcx] + mov rbp,r11 + sbb r11,QWORD PTR[24+rcx] + sbb rdx,0 + + cmovc r8,rax + cmovc r9,rsi + mov QWORD PTR[rdi],r8 + cmovc r10,rbx + mov QWORD PTR[8+rdi],r9 + cmovc r11,rbp + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_add_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_add_mod_256:: +add_mod_256 ENDP + + +PUBLIC mul_by_3_mod_256 + + +ALIGN 32 +mul_by_3_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_3_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + +$L$SEH_body_mul_by_3_mod_256:: + + + mov rcx,rdx + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov rdx,rsi + mov r11,QWORD PTR[24+rsi] + + call __lshift_mod_256 + mov r12,QWORD PTR[rsp] + + jmp $L$oaded_a_add_mod_256 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_mul_by_3_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_3_mod_256:: +mul_by_3_mod_256 ENDP + + +ALIGN 32 +__lshift_mod_256 PROC PRIVATE + DB 243,15,30,250 + add r8,r8 + adc r9,r9 + mov rax,r8 + adc r10,r10 + mov rsi,r9 + adc r11,r11 + sbb r12,r12 + + mov rbx,r10 
+ sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rcx] + mov rbp,r11 + sbb r11,QWORD PTR[24+rcx] + sbb r12,0 + + cmovc r8,rax + cmovc r9,rsi + cmovc r10,rbx + cmovc r11,rbp + + DB 0F3h,0C3h ;repret +__lshift_mod_256 ENDP + + +PUBLIC lshift_mod_256 + + +ALIGN 32 +lshift_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_lshift_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + +$L$SEH_body_lshift_mod_256:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + +$L$oop_lshift_mod_256:: + call __lshift_mod_256 + dec edx + jnz $L$oop_lshift_mod_256 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + mov r12,QWORD PTR[rsp] + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_lshift_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_lshift_mod_256:: +lshift_mod_256 ENDP + + +PUBLIC rshift_mod_256 + + +ALIGN 32 +rshift_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_rshift_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + sub rsp,8 + +$L$SEH_body_rshift_mod_256:: + + + mov rbp,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + +$L$oop_rshift_mod_256:: + mov r8,rbp + and rbp,1 + mov rax,QWORD PTR[rcx] + neg rbp + mov rsi,QWORD PTR[8+rcx] + mov rbx,QWORD PTR[16+rcx] + + and rax,rbp + and rsi,rbp + and rbx,rbp + and rbp,QWORD PTR[24+rcx] + + add r8,rax + adc r9,rsi + adc r10,rbx + adc r11,rbp + sbb rax,rax + + shr r8,1 + mov rbp,r9 + shr r9,1 + mov rbx,r10 + shr r10,1 + mov rsi,r11 + shr r11,1 + + shl rbp,63 + shl rbx,63 + or rbp,r8 + shl rsi,63 + or r9,rbx + shl rax,63 + or r10,rsi + or r11,rax + + dec edx + jnz $L$oop_rshift_mod_256 + + mov QWORD PTR[rdi],rbp + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_rshift_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_rshift_mod_256:: +rshift_mod_256 ENDP + + +PUBLIC cneg_mod_256 + + +ALIGN 32 +cneg_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_cneg_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + +$L$SEH_body_cneg_mod_256:: + + + mov r12,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r8,r12 + mov r11,QWORD PTR[24+rsi] + or r12,r9 + or r12,r10 + or r12,r11 + mov rbp,-1 + + mov rax,QWORD PTR[rcx] + cmovnz r12,rbp + mov rsi,QWORD PTR[8+rcx] + mov rbx,QWORD PTR[16+rcx] + and rax,r12 + mov rbp,QWORD PTR[24+rcx] + and rsi,r12 + and rbx,r12 + and rbp,r12 + + sub rax,r8 + sbb rsi,r9 + sbb rbx,r10 + sbb rbp,r11 + + or rdx,rdx + + cmovz rax,r8 + cmovz rsi,r9 + mov QWORD PTR[rdi],rax + cmovz rbx,r10 + mov QWORD PTR[8+rdi],rsi + cmovz rbp,r11 + mov QWORD PTR[16+rdi],rbx + mov QWORD PTR[24+rdi],rbp + + mov r12,QWORD PTR[rsp] + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea 
rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_cneg_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_cneg_mod_256:: +cneg_mod_256 ENDP + + +PUBLIC sub_mod_256 + + +ALIGN 32 +sub_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sub_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + sub rsp,8 + +$L$SEH_body_sub_mod_256:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + + sub r8,QWORD PTR[rdx] + mov rax,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rdx] + mov rsi,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[24+rcx] + sbb rdx,rdx + + and rax,rdx + and rsi,rdx + and rbx,rdx + and rbp,rdx + + add r8,rax + adc r9,rsi + mov QWORD PTR[rdi],r8 + adc r10,rbx + mov QWORD PTR[8+rdi],r9 + adc r11,rbp + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_sub_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sub_mod_256:: +sub_mod_256 ENDP + + +PUBLIC check_mod_256 + + +ALIGN 32 +check_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_check_mod_256:: + mov rdi,rcx + mov rsi,rdx + + + + mov rax,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + mov r10,QWORD PTR[16+rdi] + mov r11,QWORD PTR[24+rdi] + + mov r8,rax + or rax,r9 + or rax,r10 + or rax,r11 + + sub r8,QWORD PTR[rsi] + sbb r9,QWORD PTR[8+rsi] + sbb r10,QWORD PTR[16+rsi] + sbb r11,QWORD PTR[24+rsi] + sbb rsi,rsi + + mov rdx,1 + cmp rax,0 + cmovne rax,rdx + and rax,rsi +$L$SEH_epilogue_check_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_check_mod_256:: +check_mod_256 ENDP + + +PUBLIC add_n_check_mod_256 + + +ALIGN 32 +add_n_check_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_add_n_check_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + sub rsp,8 + +$L$SEH_body_add_n_check_mod_256:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + mov rax,r8 + adc r10,QWORD PTR[16+rdx] + mov rsi,r9 + adc r11,QWORD PTR[24+rdx] + sbb rdx,rdx + + mov rbx,r10 + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rcx] + mov rbp,r11 + sbb r11,QWORD PTR[24+rcx] + sbb rdx,0 + + cmovc r8,rax + cmovc r9,rsi + mov QWORD PTR[rdi],r8 + cmovc r10,rbx + mov QWORD PTR[8+rdi],r9 + cmovc r11,rbp + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + or r8,r9 + or r10,r11 + or r8,r10 + mov rax,1 + cmovz rax,r8 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_add_n_check_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_add_n_check_mod_256:: +add_n_check_mod_256 ENDP + + +PUBLIC sub_n_check_mod_256 + + +ALIGN 32 +sub_n_check_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + 
mov r11,rsp +$L$SEH_begin_sub_n_check_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + sub rsp,8 + +$L$SEH_body_sub_n_check_mod_256:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + + sub r8,QWORD PTR[rdx] + mov rax,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rdx] + mov rsi,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[24+rcx] + sbb rdx,rdx + + and rax,rdx + and rsi,rdx + and rbx,rdx + and rbp,rdx + + add r8,rax + adc r9,rsi + mov QWORD PTR[rdi],r8 + adc r10,rbx + mov QWORD PTR[8+rdi],r9 + adc r11,rbp + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + or r8,r9 + or r10,r11 + or r8,r10 + mov rax,1 + cmovz rax,r8 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_sub_n_check_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sub_n_check_mod_256:: +sub_n_check_mod_256 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_add_mod_256 + DD imagerel $L$SEH_body_add_mod_256 + DD imagerel $L$SEH_info_add_mod_256_prologue + + DD imagerel $L$SEH_body_add_mod_256 + DD imagerel $L$SEH_epilogue_add_mod_256 + DD imagerel $L$SEH_info_add_mod_256_body + + DD imagerel $L$SEH_epilogue_add_mod_256 + DD imagerel $L$SEH_end_add_mod_256 + DD imagerel $L$SEH_info_add_mod_256_epilogue + + DD imagerel $L$SEH_begin_mul_by_3_mod_256 + DD imagerel $L$SEH_body_mul_by_3_mod_256 + DD imagerel $L$SEH_info_mul_by_3_mod_256_prologue + + DD imagerel $L$SEH_body_mul_by_3_mod_256 + DD imagerel $L$SEH_epilogue_mul_by_3_mod_256 + DD imagerel $L$SEH_info_mul_by_3_mod_256_body + + DD imagerel $L$SEH_epilogue_mul_by_3_mod_256 + DD imagerel $L$SEH_end_mul_by_3_mod_256 + DD imagerel $L$SEH_info_mul_by_3_mod_256_epilogue + + DD imagerel $L$SEH_begin_lshift_mod_256 + DD imagerel $L$SEH_body_lshift_mod_256 + DD imagerel $L$SEH_info_lshift_mod_256_prologue + + DD imagerel $L$SEH_body_lshift_mod_256 + DD imagerel $L$SEH_epilogue_lshift_mod_256 + DD imagerel $L$SEH_info_lshift_mod_256_body + + DD imagerel $L$SEH_epilogue_lshift_mod_256 + DD imagerel $L$SEH_end_lshift_mod_256 + DD imagerel $L$SEH_info_lshift_mod_256_epilogue + + DD imagerel $L$SEH_begin_rshift_mod_256 + DD imagerel $L$SEH_body_rshift_mod_256 + DD imagerel $L$SEH_info_rshift_mod_256_prologue + + DD imagerel $L$SEH_body_rshift_mod_256 + DD imagerel $L$SEH_epilogue_rshift_mod_256 + DD imagerel $L$SEH_info_rshift_mod_256_body + + DD imagerel $L$SEH_epilogue_rshift_mod_256 + DD imagerel $L$SEH_end_rshift_mod_256 + DD imagerel $L$SEH_info_rshift_mod_256_epilogue + + DD imagerel $L$SEH_begin_cneg_mod_256 + DD imagerel $L$SEH_body_cneg_mod_256 + DD imagerel $L$SEH_info_cneg_mod_256_prologue + + DD imagerel $L$SEH_body_cneg_mod_256 + DD imagerel $L$SEH_epilogue_cneg_mod_256 + DD imagerel $L$SEH_info_cneg_mod_256_body + + DD imagerel $L$SEH_epilogue_cneg_mod_256 + DD imagerel $L$SEH_end_cneg_mod_256 + DD imagerel $L$SEH_info_cneg_mod_256_epilogue + + DD imagerel $L$SEH_begin_sub_mod_256 + DD imagerel $L$SEH_body_sub_mod_256 + DD imagerel $L$SEH_info_sub_mod_256_prologue + + DD imagerel $L$SEH_body_sub_mod_256 + DD imagerel $L$SEH_epilogue_sub_mod_256 + DD imagerel $L$SEH_info_sub_mod_256_body + + DD imagerel $L$SEH_epilogue_sub_mod_256 + DD imagerel $L$SEH_end_sub_mod_256 + DD imagerel $L$SEH_info_sub_mod_256_epilogue + + DD imagerel 
$L$SEH_epilogue_check_mod_256 + DD imagerel $L$SEH_end_check_mod_256 + DD imagerel $L$SEH_info_check_mod_256_epilogue + + DD imagerel $L$SEH_begin_add_n_check_mod_256 + DD imagerel $L$SEH_body_add_n_check_mod_256 + DD imagerel $L$SEH_info_add_n_check_mod_256_prologue + + DD imagerel $L$SEH_body_add_n_check_mod_256 + DD imagerel $L$SEH_epilogue_add_n_check_mod_256 + DD imagerel $L$SEH_info_add_n_check_mod_256_body + + DD imagerel $L$SEH_epilogue_add_n_check_mod_256 + DD imagerel $L$SEH_end_add_n_check_mod_256 + DD imagerel $L$SEH_info_add_n_check_mod_256_epilogue + + DD imagerel $L$SEH_begin_sub_n_check_mod_256 + DD imagerel $L$SEH_body_sub_n_check_mod_256 + DD imagerel $L$SEH_info_sub_n_check_mod_256_prologue + + DD imagerel $L$SEH_body_sub_n_check_mod_256 + DD imagerel $L$SEH_epilogue_sub_n_check_mod_256 + DD imagerel $L$SEH_info_sub_n_check_mod_256_body + + DD imagerel $L$SEH_epilogue_sub_n_check_mod_256 + DD imagerel $L$SEH_end_sub_n_check_mod_256 + DD imagerel $L$SEH_info_sub_n_check_mod_256_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_add_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_add_mod_256_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h +$L$SEH_info_add_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_3_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_3_mod_256_body:: +DB 1,0,11,0 +DB 000h,0c4h,000h,000h +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +$L$SEH_info_mul_by_3_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_lshift_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_lshift_mod_256_body:: +DB 1,0,11,0 +DB 000h,0c4h,000h,000h +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +$L$SEH_info_lshift_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_rshift_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_rshift_mod_256_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h +$L$SEH_info_rshift_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_cneg_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_cneg_mod_256_body:: +DB 1,0,11,0 +DB 000h,0c4h,000h,000h +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +$L$SEH_info_cneg_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sub_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sub_mod_256_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h +$L$SEH_info_sub_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 
000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_check_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_add_n_check_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_add_n_check_mod_256_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h +$L$SEH_info_add_n_check_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sub_n_check_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sub_n_check_mod_256_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h +$L$SEH_info_sub_n_check_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/add_mod_384-armv8.asm b/crypto/blst_src/build/win64/add_mod_384-armv8.asm new file mode 100644 index 00000000000..4bf703a6da0 --- /dev/null +++ b/crypto/blst_src/build/win64/add_mod_384-armv8.asm @@ -0,0 +1,1001 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + + EXPORT |add_mod_384|[FUNC] + ALIGN 32 +|add_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__add_mod_384| PROC + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + +|__add_mod_384_ab_are_loaded| + adds x10,x10,x16 + adcs x11,x11,x17 + adcs x12,x12,x19 + adcs x13,x13,x20 + adcs x14,x14,x21 + adcs x15,x15,x22 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csello x10,x10,x16 + csello x11,x11,x17 + csello x12,x12,x19 + csello x13,x13,x20 + csello x14,x14,x21 + csello x15,x15,x22 + + ret + ENDP + + + + EXPORT |add_mod_384x|[FUNC] + ALIGN 32 +|add_mod_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |rshift_mod_384|[FUNC] + ALIGN 32 +|rshift_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +|$Loop_rshift_mod_384| + sub x2,x2,#1 + bl __rshift_mod_384 + cbnz x2,|$Loop_rshift_mod_384| + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__rshift_mod_384| PROC + sbfx x22,x10,#0,#1 + and x16,x22,x4 + and x17,x22,x5 + adds x10,x10,x16 + and x19,x22,x6 + adcs x11,x11,x17 + and x20,x22,x7 + adcs x12,x12,x19 + and x21,x22,x8 + adcs x13,x13,x20 + and x22,x22,x9 + adcs x14,x14,x21 + extr x10,x11,x10,#1 // a[0:5] >>= 1 + adcs x15,x15,x22 + extr x11,x12,x11,#1 + adc x22,xzr,xzr + extr x12,x13,x12,#1 + extr x13,x14,x13,#1 + extr x14,x15,x14,#1 + extr x15,x22,x15,#1 + ret + ENDP + + + + EXPORT |div_by_2_mod_384|[FUNC] + ALIGN 32 +|div_by_2_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |lshift_mod_384|[FUNC] + ALIGN 32 +|lshift_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +|$Loop_lshift_mod_384| + sub x2,x2,#1 + bl __lshift_mod_384 + cbnz x2,|$Loop_lshift_mod_384| + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__lshift_mod_384| PROC + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csello x10,x10,x16 + csello x11,x11,x17 + csello x12,x12,x19 + csello x13,x13,x20 + csello x14,x14,x21 + csello x15,x15,x22 + + ret + ENDP + + + + EXPORT |mul_by_3_mod_384|[FUNC] + ALIGN 32 +|mul_by_3_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |mul_by_8_mod_384|[FUNC] + ALIGN 32 +|mul_by_8_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |mul_by_3_mod_384x|[FUNC] + ALIGN 32 +|mul_by_3_mod_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + + ldp x16,x17,[x1,#48] + ldp x19,x20,[x1,#64] + ldp x21,x22,[x1,#80] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |mul_by_8_mod_384x|[FUNC] + ALIGN 32 +|mul_by_8_mod_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |cneg_mod_384|[FUNC] + ALIGN 32 +|cneg_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x4,x5,[x3] + ldp x12,x13,[x1,#16] + ldp x6,x7,[x3,#16] + + subs x16,x4,x10 + ldp x14,x15,[x1,#32] + ldp x8,x9,[x3,#32] + orr x3,x10,x11 + sbcs x17,x5,x11 + orr x3,x3,x12 + sbcs x19,x6,x12 + orr x3,x3,x13 + sbcs x20,x7,x13 + orr x3,x3,x14 + sbcs x21,x8,x14 + orr x3,x3,x15 + sbc x22,x9,x15 + + cmp x3,#0 + csetmne x3 + ands x2,x2,x3 + + cseleq x10,x10,x16 + cseleq x11,x11,x17 + cseleq x12,x12,x19 + cseleq x13,x13,x20 + stp x10,x11,[x0] + cseleq x14,x14,x21 + stp x12,x13,[x0,#16] + cseleq x15,x15,x22 + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |sub_mod_384|[FUNC] + ALIGN 32 +|sub_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__sub_mod_384| PROC + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + + subs x10,x10,x16 + sbcs x11,x11,x17 + sbcs x12,x12,x19 + sbcs x13,x13,x20 + sbcs x14,x14,x21 + sbcs x15,x15,x22 + sbc x3,xzr,xzr + + and x16,x4,x3 + and x17,x5,x3 + adds x10,x10,x16 + and x19,x6,x3 + adcs x11,x11,x17 + and x20,x7,x3 + adcs x12,x12,x19 + and x21,x8,x3 + adcs x13,x13,x20 + and x22,x9,x3 + adcs x14,x14,x21 + adc x15,x15,x22 + + ret + ENDP + + + + EXPORT |sub_mod_384x|[FUNC] + ALIGN 32 +|sub_mod_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |mul_by_1_plus_i_mod_384x|[FUNC] + ALIGN 32 +|mul_by_1_plus_i_mod_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + add x2,x1,#48 + + bl __sub_mod_384 // a->re - a->im + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __add_mod_384_ab_are_loaded // a->re + a->im + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |sgn0_pty_mod_384|[FUNC] + ALIGN 32 +|sgn0_pty_mod_384| PROC + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x0,x10,#1 + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x3,x3,xzr + + mvn x3,x3 + and x3,x3,#2 + orr x0,x0,x3 + + ret + ENDP + + + + EXPORT |sgn0_pty_mod_384x|[FUNC] + ALIGN 32 +|sgn0_pty_mod_384x| PROC + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x2,x10,#1 + orr x3,x10,x11 + adds x10,x10,x10 + orr x3,x3,x12 + adcs x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + ldp x10,x11,[x0,#48] + ldp x12,x13,[x0,#64] + ldp x14,x15,[x0,#80] + + mvn x16,x16 + and x16,x16,#2 + orr x2,x2,x16 + + and x0,x10,#1 + orr x1,x10,x11 + adds x10,x10,x10 + orr x1,x1,x12 + adcs x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + 
orr x1,x1,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + mvn x16,x16 + and x16,x16,#2 + orr x0,x0,x16 + + cmp x3,#0 + cseleq x3,x0,x2 + + cmp x1,#0 + cselne x1,x0,x2 + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ret + ENDP + + + EXPORT |vec_select_32|[FUNC] + ALIGN 32 +|vec_select_32| PROC + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + ENDP + + + EXPORT |vec_select_48|[FUNC] + ALIGN 32 +|vec_select_48| PROC + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + ENDP + + + EXPORT |vec_select_96|[FUNC] + ALIGN 32 +|vec_select_96| PROC + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + ENDP + + + EXPORT |vec_select_192|[FUNC] + ALIGN 32 +|vec_select_192| PROC + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + ENDP + + + EXPORT |vec_select_144|[FUNC] + ALIGN 32 +|vec_select_144| PROC + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + ENDP + + + EXPORT |vec_select_288|[FUNC] + ALIGN 32 +|vec_select_288| PROC + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 
{v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + ENDP + + + EXPORT |vec_prefetch|[FUNC] + ALIGN 32 +|vec_prefetch| PROC + add x1, x1, x0 + sub x1, x1, #1 + mov x2, #64 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + cselhi x0,x1,x0 + cselhi x2,xzr,x2 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + cselhi x0,x1,x0 + cselhi x2,xzr,x2 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + cselhi x0,x1,x0 + cselhi x2,xzr,x2 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + cselhi x0,x1,x0 + cselhi x2,xzr,x2 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + cselhi x0,x1,x0 + cselhi x2,xzr,x2 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + cselhi x0,x1,x0 + prfm pldl1keep, [x0] + ret + ENDP + + + EXPORT |vec_is_zero_16x|[FUNC] + ALIGN 32 +|vec_is_zero_16x| PROC + ld1 {v0.2d}, [x0], #16 + lsr x1, x1, #4 + sub x1, x1, #1 + cbz x1, |$Loop_is_zero_done| + +|$Loop_is_zero| + ld1 {v1.2d}, [x0], #16 + orr v0.16b, v0.16b, v1.16b + sub x1, x1, #1 + cbnz x1, |$Loop_is_zero| + +|$Loop_is_zero_done| + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + cseleq x0,x0,xzr + ret + ENDP + + + EXPORT |vec_is_equal_16x|[FUNC] + ALIGN 32 +|vec_is_equal_16x| PROC + ld1 {v0.2d}, [x0], #16 + ld1 {v1.2d}, [x1], #16 + lsr x2, x2, #4 + eor v0.16b, v0.16b, v1.16b + +|$Loop_is_equal| + sub x2, x2, #1 + cbz x2, |$Loop_is_equal_done| + ld1 {v1.2d}, [x0], #16 + ld1 {v2.2d}, [x1], #16 + eor v1.16b, v1.16b, v2.16b + orr v0.16b, v0.16b, v1.16b + b |$Loop_is_equal| + nop + +|$Loop_is_equal_done| + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + cseleq x0,x0,xzr + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/add_mod_384-x86_64.asm b/crypto/blst_src/build/win64/add_mod_384-x86_64.asm new file mode 100644 index 00000000000..8a7b9e255db --- /dev/null +++ b/crypto/blst_src/build/win64/add_mod_384-x86_64.asm @@ -0,0 +1,2504 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC add_mod_384 + + +ALIGN 32 +add_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_add_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_add_mod_384:: + + + call __add_mod_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD 
PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_add_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_add_mod_384:: +add_mod_384 ENDP + + +ALIGN 32 +__add_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +__add_mod_384_a_is_loaded:: + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + adc r10,QWORD PTR[16+rdx] + mov r14,r8 + adc r11,QWORD PTR[24+rdx] + mov r15,r9 + adc r12,QWORD PTR[32+rdx] + mov rax,r10 + adc r13,QWORD PTR[40+rdx] + mov rbx,r11 + sbb rdx,rdx + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + mov QWORD PTR[rdi],r8 + cmovc r11,rbx + mov QWORD PTR[8+rdi],r9 + cmovc r12,rbp + mov QWORD PTR[16+rdi],r10 + cmovc r13,rsi + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__add_mod_384 ENDP + +PUBLIC add_mod_384x + + +ALIGN 32 +add_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_add_mod_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,24 + +$L$SEH_body_add_mod_384x:: + + + mov QWORD PTR[rsp],rsi + mov QWORD PTR[8+rsp],rdx + lea rsi,QWORD PTR[48+rsi] + lea rdx,QWORD PTR[48+rdx] + lea rdi,QWORD PTR[48+rdi] + call __add_mod_384 + + mov rsi,QWORD PTR[rsp] + mov rdx,QWORD PTR[8+rsp] + lea rdi,QWORD PTR[((-48))+rdi] + call __add_mod_384 + + mov r15,QWORD PTR[((24+0))+rsp] + + mov r14,QWORD PTR[((24+8))+rsp] + + mov r13,QWORD PTR[((24+16))+rsp] + + mov r12,QWORD PTR[((24+24))+rsp] + + mov rbx,QWORD PTR[((24+32))+rsp] + + mov rbp,QWORD PTR[((24+40))+rsp] + + lea rsp,QWORD PTR[((24+48))+rsp] + +$L$SEH_epilogue_add_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_add_mod_384x:: +add_mod_384x ENDP + + +PUBLIC rshift_mod_384 + + +ALIGN 32 +rshift_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_rshift_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_rshift_mod_384:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +$L$oop_rshift_mod_384:: + call __rshift_mod_384 + dec edx + jnz $L$oop_rshift_mod_384 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_rshift_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_rshift_mod_384:: +rshift_mod_384 ENDP + + +ALIGN 32 +__rshift_mod_384 PROC 
PRIVATE + DB 243,15,30,250 + mov rsi,1 + mov r14,QWORD PTR[rcx] + and rsi,r8 + mov r15,QWORD PTR[8+rcx] + neg rsi + mov rax,QWORD PTR[16+rcx] + and r14,rsi + mov rbx,QWORD PTR[24+rcx] + and r15,rsi + mov rbp,QWORD PTR[32+rcx] + and rax,rsi + and rbx,rsi + and rbp,rsi + and rsi,QWORD PTR[40+rcx] + + add r14,r8 + adc r15,r9 + adc rax,r10 + adc rbx,r11 + adc rbp,r12 + adc rsi,r13 + sbb r13,r13 + + shr r14,1 + mov r8,r15 + shr r15,1 + mov r9,rax + shr rax,1 + mov r10,rbx + shr rbx,1 + mov r11,rbp + shr rbp,1 + mov r12,rsi + shr rsi,1 + shl r8,63 + shl r9,63 + or r8,r14 + shl r10,63 + or r9,r15 + shl r11,63 + or r10,rax + shl r12,63 + or r11,rbx + shl r13,63 + or r12,rbp + or r13,rsi + + DB 0F3h,0C3h ;repret +__rshift_mod_384 ENDP + +PUBLIC div_by_2_mod_384 + + +ALIGN 32 +div_by_2_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_div_by_2_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_div_by_2_mod_384:: + + + mov r8,QWORD PTR[rsi] + mov rcx,rdx + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + call __rshift_mod_384 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_div_by_2_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_div_by_2_mod_384:: +div_by_2_mod_384 ENDP + + +PUBLIC lshift_mod_384 + + +ALIGN 32 +lshift_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_lshift_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_lshift_mod_384:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +$L$oop_lshift_mod_384:: + add r8,r8 + adc r9,r9 + adc r10,r10 + mov r14,r8 + adc r11,r11 + mov r15,r9 + adc r12,r12 + mov rax,r10 + adc r13,r13 + mov rbx,r11 + sbb rdi,rdi + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD PTR[40+rcx] + sbb rdi,0 + + mov rdi,QWORD PTR[rsp] + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + cmovc r11,rbx + cmovc r12,rbp + cmovc r13,rsi + + dec edx + jnz $L$oop_lshift_mod_384 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_lshift_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_lshift_mod_384:: +lshift_mod_384 ENDP + + +ALIGN 32 
+__lshift_mod_384 PROC PRIVATE + DB 243,15,30,250 + add r8,r8 + adc r9,r9 + adc r10,r10 + mov r14,r8 + adc r11,r11 + mov r15,r9 + adc r12,r12 + mov rax,r10 + adc r13,r13 + mov rbx,r11 + sbb rdx,rdx + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + cmovc r11,rbx + cmovc r12,rbp + cmovc r13,rsi + + DB 0F3h,0C3h ;repret +__lshift_mod_384 ENDP + + +PUBLIC mul_by_3_mod_384 + + +ALIGN 32 +mul_by_3_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_3_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_mul_by_3_mod_384:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov rcx,rdx + + call __lshift_mod_384 + + mov rdx,QWORD PTR[rsp] + call __add_mod_384_a_is_loaded + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_by_3_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_3_mod_384:: +mul_by_3_mod_384 ENDP + +PUBLIC mul_by_8_mod_384 + + +ALIGN 32 +mul_by_8_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_8_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_mul_by_8_mod_384:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov rcx,rdx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_by_8_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_8_mod_384:: +mul_by_8_mod_384 ENDP + + +PUBLIC mul_by_3_mod_384x + + +ALIGN 32 +mul_by_3_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_3_mod_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_mul_by_3_mod_384x:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov rcx,rdx + + call __lshift_mod_384 + + mov rdx,QWORD PTR[rsp] + call __add_mod_384_a_is_loaded + + mov rsi,QWORD PTR[rsp] + lea rdi,QWORD PTR[48+rdi] + + mov r8,QWORD PTR[48+rsi] + mov r9,QWORD PTR[56+rsi] + mov 
r10,QWORD PTR[64+rsi] + mov r11,QWORD PTR[72+rsi] + mov r12,QWORD PTR[80+rsi] + mov r13,QWORD PTR[88+rsi] + + call __lshift_mod_384 + + mov rdx,8*6 + add rdx,QWORD PTR[rsp] + call __add_mod_384_a_is_loaded + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_by_3_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_3_mod_384x:: +mul_by_3_mod_384x ENDP + +PUBLIC mul_by_8_mod_384x + + +ALIGN 32 +mul_by_8_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_8_mod_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_mul_by_8_mod_384x:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov rcx,rdx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov rsi,QWORD PTR[rsp] + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r8,QWORD PTR[((48+0))+rsi] + mov r9,QWORD PTR[((48+8))+rsi] + mov r10,QWORD PTR[((48+16))+rsi] + mov r11,QWORD PTR[((48+24))+rsi] + mov r12,QWORD PTR[((48+32))+rsi] + mov r13,QWORD PTR[((48+40))+rsi] + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov QWORD PTR[((48+0))+rdi],r8 + mov QWORD PTR[((48+8))+rdi],r9 + mov QWORD PTR[((48+16))+rdi],r10 + mov QWORD PTR[((48+24))+rdi],r11 + mov QWORD PTR[((48+32))+rdi],r12 + mov QWORD PTR[((48+40))+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_by_8_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_8_mod_384x:: +mul_by_8_mod_384x ENDP + + +PUBLIC cneg_mod_384 + + +ALIGN 32 +cneg_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_cneg_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdx + +$L$SEH_body_cneg_mod_384:: + + + mov rdx,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r8,rdx + mov r11,QWORD PTR[24+rsi] + or rdx,r9 + mov r12,QWORD PTR[32+rsi] + or rdx,r10 + mov r13,QWORD PTR[40+rsi] + or rdx,r11 + mov rsi,-1 + or rdx,r12 + or rdx,r13 + + mov r14,QWORD PTR[rcx] + cmovnz rdx,rsi + mov r15,QWORD PTR[8+rcx] + mov rax,QWORD PTR[16+rcx] + and r14,rdx + mov rbx,QWORD PTR[24+rcx] + and r15,rdx + mov rbp,QWORD PTR[32+rcx] + and rax,rdx + mov rsi,QWORD PTR[40+rcx] + and rbx,rdx + mov rcx,QWORD PTR[rsp] + and rbp,rdx + and rsi,rdx + + sub r14,r8 + sbb r15,r9 + sbb rax,r10 + sbb rbx,r11 + sbb rbp,r12 + sbb rsi,r13 + + or rcx,rcx + + cmovz r14,r8 + cmovz r15,r9 + cmovz rax,r10 + mov QWORD PTR[rdi],r14 + cmovz rbx,r11 + mov QWORD PTR[8+rdi],r15 + cmovz rbp,r12 + mov QWORD PTR[16+rdi],rax + cmovz rsi,r13 + mov QWORD PTR[24+rdi],rbx 
+ mov QWORD PTR[32+rdi],rbp + mov QWORD PTR[40+rdi],rsi + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_cneg_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_cneg_mod_384:: +cneg_mod_384 ENDP + + +PUBLIC sub_mod_384 + + +ALIGN 32 +sub_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sub_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sub_mod_384:: + + + call __sub_mod_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sub_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sub_mod_384:: +sub_mod_384 ENDP + + +ALIGN 32 +__sub_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + sub r8,QWORD PTR[rdx] + mov r14,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rdx] + mov r15,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rdx] + mov rax,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rdx] + mov rbx,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rdx] + mov rbp,QWORD PTR[32+rcx] + sbb r13,QWORD PTR[40+rdx] + mov rsi,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r14,rdx + and r15,rdx + and rax,rdx + and rbx,rdx + and rbp,rdx + and rsi,rdx + + add r8,r14 + adc r9,r15 + mov QWORD PTR[rdi],r8 + adc r10,rax + mov QWORD PTR[8+rdi],r9 + adc r11,rbx + mov QWORD PTR[16+rdi],r10 + adc r12,rbp + mov QWORD PTR[24+rdi],r11 + adc r13,rsi + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__sub_mod_384 ENDP + +PUBLIC sub_mod_384x + + +ALIGN 32 +sub_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sub_mod_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,24 + +$L$SEH_body_sub_mod_384x:: + + + mov QWORD PTR[rsp],rsi + mov QWORD PTR[8+rsp],rdx + lea rsi,QWORD PTR[48+rsi] + lea rdx,QWORD PTR[48+rdx] + lea rdi,QWORD PTR[48+rdi] + call __sub_mod_384 + + mov rsi,QWORD PTR[rsp] + mov rdx,QWORD PTR[8+rsp] + lea rdi,QWORD PTR[((-48))+rdi] + call __sub_mod_384 + + mov r15,QWORD PTR[((24+0))+rsp] + + mov r14,QWORD PTR[((24+8))+rsp] + + mov r13,QWORD PTR[((24+16))+rsp] + + mov r12,QWORD PTR[((24+24))+rsp] + + mov rbx,QWORD PTR[((24+32))+rsp] + + mov rbp,QWORD PTR[((24+40))+rsp] + + lea rsp,QWORD PTR[((24+48))+rsp] + +$L$SEH_epilogue_sub_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sub_mod_384x:: +sub_mod_384x ENDP +PUBLIC mul_by_1_plus_i_mod_384x + + +ALIGN 32 +mul_by_1_plus_i_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_1_plus_i_mod_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + 
push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,56 + +$L$SEH_body_mul_by_1_plus_i_mod_384x:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,r8 + add r8,QWORD PTR[48+rsi] + mov r15,r9 + adc r9,QWORD PTR[56+rsi] + mov rax,r10 + adc r10,QWORD PTR[64+rsi] + mov rbx,r11 + adc r11,QWORD PTR[72+rsi] + mov rcx,r12 + adc r12,QWORD PTR[80+rsi] + mov rbp,r13 + adc r13,QWORD PTR[88+rsi] + mov QWORD PTR[48+rsp],rdi + sbb rdi,rdi + + sub r14,QWORD PTR[48+rsi] + sbb r15,QWORD PTR[56+rsi] + sbb rax,QWORD PTR[64+rsi] + sbb rbx,QWORD PTR[72+rsi] + sbb rcx,QWORD PTR[80+rsi] + sbb rbp,QWORD PTR[88+rsi] + sbb rsi,rsi + + mov QWORD PTR[rsp],r8 + mov r8,QWORD PTR[rdx] + mov QWORD PTR[8+rsp],r9 + mov r9,QWORD PTR[8+rdx] + mov QWORD PTR[16+rsp],r10 + mov r10,QWORD PTR[16+rdx] + mov QWORD PTR[24+rsp],r11 + mov r11,QWORD PTR[24+rdx] + mov QWORD PTR[32+rsp],r12 + and r8,rsi + mov r12,QWORD PTR[32+rdx] + mov QWORD PTR[40+rsp],r13 + and r9,rsi + mov r13,QWORD PTR[40+rdx] + and r10,rsi + and r11,rsi + and r12,rsi + and r13,rsi + mov rsi,QWORD PTR[48+rsp] + + add r14,r8 + mov r8,QWORD PTR[rsp] + adc r15,r9 + mov r9,QWORD PTR[8+rsp] + adc rax,r10 + mov r10,QWORD PTR[16+rsp] + adc rbx,r11 + mov r11,QWORD PTR[24+rsp] + adc rcx,r12 + mov r12,QWORD PTR[32+rsp] + adc rbp,r13 + mov r13,QWORD PTR[40+rsp] + + mov QWORD PTR[rsi],r14 + mov r14,r8 + mov QWORD PTR[8+rsi],r15 + mov QWORD PTR[16+rsi],rax + mov r15,r9 + mov QWORD PTR[24+rsi],rbx + mov QWORD PTR[32+rsi],rcx + mov rax,r10 + mov QWORD PTR[40+rsi],rbp + + sub r8,QWORD PTR[rdx] + mov rbx,r11 + sbb r9,QWORD PTR[8+rdx] + sbb r10,QWORD PTR[16+rdx] + mov rcx,r12 + sbb r11,QWORD PTR[24+rdx] + sbb r12,QWORD PTR[32+rdx] + mov rbp,r13 + sbb r13,QWORD PTR[40+rdx] + sbb rdi,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + mov QWORD PTR[48+rsi],r8 + cmovc r11,rbx + mov QWORD PTR[56+rsi],r9 + cmovc r12,rcx + mov QWORD PTR[64+rsi],r10 + cmovc r13,rbp + mov QWORD PTR[72+rsi],r11 + mov QWORD PTR[80+rsi],r12 + mov QWORD PTR[88+rsi],r13 + + mov r15,QWORD PTR[((56+0))+rsp] + + mov r14,QWORD PTR[((56+8))+rsp] + + mov r13,QWORD PTR[((56+16))+rsp] + + mov r12,QWORD PTR[((56+24))+rsp] + + mov rbx,QWORD PTR[((56+32))+rsp] + + mov rbp,QWORD PTR[((56+40))+rsp] + + lea rsp,QWORD PTR[((56+48))+rsp] + +$L$SEH_epilogue_mul_by_1_plus_i_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_1_plus_i_mod_384x:: +mul_by_1_plus_i_mod_384x ENDP +PUBLIC sgn0_pty_mod_384 + + +ALIGN 32 +sgn0_pty_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0_pty_mod_384:: + mov rdi,rcx + mov rsi,rdx + + + +$L$SEH_body_sgn0_pty_mod_384:: + + mov r8,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + mov r10,QWORD PTR[16+rdi] + mov r11,QWORD PTR[24+rdi] + mov rcx,QWORD PTR[32+rdi] + mov rdx,QWORD PTR[40+rdi] + + xor rax,rax + mov rdi,r8 + add r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rcx,rcx + adc rdx,rdx + adc rax,0 + + sub r8,QWORD PTR[rsi] + sbb r9,QWORD PTR[8+rsi] + sbb r10,QWORD PTR[16+rsi] + sbb r11,QWORD PTR[24+rsi] + sbb rcx,QWORD PTR[32+rsi] + sbb rdx,QWORD PTR[40+rsi] + sbb rax,0 + + not rax + and rdi,1 + and rax,2 + or rax,rdi + +$L$SEH_epilogue_sgn0_pty_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0_pty_mod_384:: +sgn0_pty_mod_384 
ENDP + +PUBLIC sgn0_pty_mod_384x + + +ALIGN 32 +sgn0_pty_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0_pty_mod_384x:: + mov rdi,rcx + mov rsi,rdx + + + + push rbp + + push rbx + + sub rsp,8 + +$L$SEH_body_sgn0_pty_mod_384x:: + + + mov r8,QWORD PTR[48+rdi] + mov r9,QWORD PTR[56+rdi] + mov r10,QWORD PTR[64+rdi] + mov r11,QWORD PTR[72+rdi] + mov rcx,QWORD PTR[80+rdi] + mov rdx,QWORD PTR[88+rdi] + + mov rbx,r8 + or r8,r9 + or r8,r10 + or r8,r11 + or r8,rcx + or r8,rdx + + lea rax,QWORD PTR[rdi] + xor rdi,rdi + mov rbp,rbx + add rbx,rbx + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rcx,rcx + adc rdx,rdx + adc rdi,0 + + sub rbx,QWORD PTR[rsi] + sbb r9,QWORD PTR[8+rsi] + sbb r10,QWORD PTR[16+rsi] + sbb r11,QWORD PTR[24+rsi] + sbb rcx,QWORD PTR[32+rsi] + sbb rdx,QWORD PTR[40+rsi] + sbb rdi,0 + + mov QWORD PTR[rsp],r8 + not rdi + and rbp,1 + and rdi,2 + or rdi,rbp + + mov r8,QWORD PTR[rax] + mov r9,QWORD PTR[8+rax] + mov r10,QWORD PTR[16+rax] + mov r11,QWORD PTR[24+rax] + mov rcx,QWORD PTR[32+rax] + mov rdx,QWORD PTR[40+rax] + + mov rbx,r8 + or r8,r9 + or r8,r10 + or r8,r11 + or r8,rcx + or r8,rdx + + xor rax,rax + mov rbp,rbx + add rbx,rbx + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rcx,rcx + adc rdx,rdx + adc rax,0 + + sub rbx,QWORD PTR[rsi] + sbb r9,QWORD PTR[8+rsi] + sbb r10,QWORD PTR[16+rsi] + sbb r11,QWORD PTR[24+rsi] + sbb rcx,QWORD PTR[32+rsi] + sbb rdx,QWORD PTR[40+rsi] + sbb rax,0 + + mov rbx,QWORD PTR[rsp] + + not rax + + test r8,r8 + cmovz rbp,rdi + + test rbx,rbx + cmovnz rax,rdi + + and rbp,1 + and rax,2 + or rax,rbp + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_sgn0_pty_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0_pty_mod_384x:: +sgn0_pty_mod_384x ENDP +PUBLIC vec_select_32 + + +ALIGN 32 +vec_select_32 PROC PUBLIC + DB 243,15,30,250 + movd xmm5,r9d + pxor xmm4,xmm4 + pshufd xmm5,xmm5,0 + movdqu xmm0,XMMWORD PTR[rdx] + lea rdx,QWORD PTR[16+rdx] + pcmpeqd xmm5,xmm4 + movdqu xmm1,XMMWORD PTR[r8] + lea r8,QWORD PTR[16+r8] + pcmpeqd xmm4,xmm5 + lea rcx,QWORD PTR[16+rcx] + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((0+16-16))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((0+16-16))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(0-16)+rcx],xmm0 + pand xmm2,xmm4 + pand xmm3,xmm5 + por xmm2,xmm3 + movdqu XMMWORD PTR[(16-16)+rcx],xmm2 + DB 0F3h,0C3h ;repret +vec_select_32 ENDP +PUBLIC vec_select_48 + + +ALIGN 32 +vec_select_48 PROC PUBLIC + DB 243,15,30,250 + movd xmm5,r9d + pxor xmm4,xmm4 + pshufd xmm5,xmm5,0 + movdqu xmm0,XMMWORD PTR[rdx] + lea rdx,QWORD PTR[24+rdx] + pcmpeqd xmm5,xmm4 + movdqu xmm1,XMMWORD PTR[r8] + lea r8,QWORD PTR[24+r8] + pcmpeqd xmm4,xmm5 + lea rcx,QWORD PTR[24+rcx] + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((0+16-24))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((0+16-24))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(0-24)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((16+16-24))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((16+16-24))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(16-24)+rcx],xmm2 + pand xmm0,xmm4 + pand xmm1,xmm5 + por xmm0,xmm1 + movdqu XMMWORD PTR[(32-24)+rcx],xmm0 + DB 0F3h,0C3h ;repret +vec_select_48 ENDP +PUBLIC vec_select_96 + + +ALIGN 32 +vec_select_96 PROC PUBLIC + DB 243,15,30,250 + movd xmm5,r9d + pxor xmm4,xmm4 + pshufd xmm5,xmm5,0 + movdqu xmm0,XMMWORD PTR[rdx] + lea rdx,QWORD PTR[48+rdx] + 
pcmpeqd xmm5,xmm4 + movdqu xmm1,XMMWORD PTR[r8] + lea r8,QWORD PTR[48+r8] + pcmpeqd xmm4,xmm5 + lea rcx,QWORD PTR[48+rcx] + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((0+16-48))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((0+16-48))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(0-48)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((16+16-48))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((16+16-48))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(16-48)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((32+16-48))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((32+16-48))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(32-48)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((48+16-48))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((48+16-48))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(48-48)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((64+16-48))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((64+16-48))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(64-48)+rcx],xmm0 + pand xmm2,xmm4 + pand xmm3,xmm5 + por xmm2,xmm3 + movdqu XMMWORD PTR[(80-48)+rcx],xmm2 + DB 0F3h,0C3h ;repret +vec_select_96 ENDP +PUBLIC vec_select_192 + + +ALIGN 32 +vec_select_192 PROC PUBLIC + DB 243,15,30,250 + movd xmm5,r9d + pxor xmm4,xmm4 + pshufd xmm5,xmm5,0 + movdqu xmm0,XMMWORD PTR[rdx] + lea rdx,QWORD PTR[96+rdx] + pcmpeqd xmm5,xmm4 + movdqu xmm1,XMMWORD PTR[r8] + lea r8,QWORD PTR[96+r8] + pcmpeqd xmm4,xmm5 + lea rcx,QWORD PTR[96+rcx] + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((0+16-96))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((0+16-96))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(0-96)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((16+16-96))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((16+16-96))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(16-96)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((32+16-96))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((32+16-96))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(32-96)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((48+16-96))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((48+16-96))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(48-96)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((64+16-96))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((64+16-96))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(64-96)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((80+16-96))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((80+16-96))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(80-96)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((96+16-96))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((96+16-96))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(96-96)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((112+16-96))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((112+16-96))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(112-96)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((128+16-96))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((128+16-96))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(128-96)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((144+16-96))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((144+16-96))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(144-96)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((160+16-96))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((160+16-96))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(160-96)+rcx],xmm0 + pand xmm2,xmm4 + pand xmm3,xmm5 + por xmm2,xmm3 + movdqu XMMWORD PTR[(176-96)+rcx],xmm2 + DB 0F3h,0C3h ;repret +vec_select_192 ENDP +PUBLIC vec_select_144 
+ + +ALIGN 32 +vec_select_144 PROC PUBLIC + DB 243,15,30,250 + movd xmm5,r9d + pxor xmm4,xmm4 + pshufd xmm5,xmm5,0 + movdqu xmm0,XMMWORD PTR[rdx] + lea rdx,QWORD PTR[72+rdx] + pcmpeqd xmm5,xmm4 + movdqu xmm1,XMMWORD PTR[r8] + lea r8,QWORD PTR[72+r8] + pcmpeqd xmm4,xmm5 + lea rcx,QWORD PTR[72+rcx] + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((0+16-72))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((0+16-72))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(0-72)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((16+16-72))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((16+16-72))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(16-72)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((32+16-72))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((32+16-72))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(32-72)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((48+16-72))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((48+16-72))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(48-72)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((64+16-72))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((64+16-72))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(64-72)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((80+16-72))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((80+16-72))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(80-72)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((96+16-72))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((96+16-72))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(96-72)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((112+16-72))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((112+16-72))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(112-72)+rcx],xmm2 + pand xmm0,xmm4 + pand xmm1,xmm5 + por xmm0,xmm1 + movdqu XMMWORD PTR[(128-72)+rcx],xmm0 + DB 0F3h,0C3h ;repret +vec_select_144 ENDP +PUBLIC vec_select_288 + + +ALIGN 32 +vec_select_288 PROC PUBLIC + DB 243,15,30,250 + movd xmm5,r9d + pxor xmm4,xmm4 + pshufd xmm5,xmm5,0 + movdqu xmm0,XMMWORD PTR[rdx] + lea rdx,QWORD PTR[144+rdx] + pcmpeqd xmm5,xmm4 + movdqu xmm1,XMMWORD PTR[r8] + lea r8,QWORD PTR[144+r8] + pcmpeqd xmm4,xmm5 + lea rcx,QWORD PTR[144+rcx] + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((0+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((0+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(0-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((16+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((16+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(16-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((32+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((32+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(32-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((48+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((48+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(48-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((64+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((64+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(64-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((80+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((80+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(80-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((96+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((96+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(96-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((112+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((112+16-144))+r8] + por xmm2,xmm3 
+ movdqu XMMWORD PTR[(112-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((128+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((128+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(128-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((144+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((144+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(144-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((160+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((160+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(160-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((176+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((176+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(176-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((192+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((192+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(192-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((208+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((208+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(208-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((224+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((224+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(224-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((240+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((240+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(240-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((256+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((256+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(256-144)+rcx],xmm0 + pand xmm2,xmm4 + pand xmm3,xmm5 + por xmm2,xmm3 + movdqu XMMWORD PTR[(272-144)+rcx],xmm2 + DB 0F3h,0C3h ;repret +vec_select_288 ENDP +PUBLIC vec_prefetch + + +ALIGN 32 +vec_prefetch PROC PUBLIC + DB 243,15,30,250 + lea rdx,QWORD PTR[((-1))+rdx*1+rcx] + mov rax,64 + xor r8,r8 + prefetchnta [rcx] + lea rcx,QWORD PTR[rax*1+rcx] + cmp rcx,rdx + cmova rcx,rdx + cmova rax,r8 + prefetchnta [rcx] + lea rcx,QWORD PTR[rax*1+rcx] + cmp rcx,rdx + cmova rcx,rdx + cmova rax,r8 + prefetchnta [rcx] + lea rcx,QWORD PTR[rax*1+rcx] + cmp rcx,rdx + cmova rcx,rdx + cmova rax,r8 + prefetchnta [rcx] + lea rcx,QWORD PTR[rax*1+rcx] + cmp rcx,rdx + cmova rcx,rdx + cmova rax,r8 + prefetchnta [rcx] + lea rcx,QWORD PTR[rax*1+rcx] + cmp rcx,rdx + cmova rcx,rdx + cmova rax,r8 + prefetchnta [rcx] + lea rcx,QWORD PTR[rax*1+rcx] + cmp rcx,rdx + cmova rcx,rdx + prefetchnta [rcx] + DB 0F3h,0C3h ;repret +vec_prefetch ENDP +PUBLIC vec_is_zero_16x + + +ALIGN 32 +vec_is_zero_16x PROC PUBLIC + DB 243,15,30,250 + shr edx,4 + movdqu xmm0,XMMWORD PTR[rcx] + lea rcx,QWORD PTR[16+rcx] + +$L$oop_is_zero:: + dec edx + jz $L$oop_is_zero_done + movdqu xmm1,XMMWORD PTR[rcx] + lea rcx,QWORD PTR[16+rcx] + por xmm0,xmm1 + jmp $L$oop_is_zero + +$L$oop_is_zero_done:: + pshufd xmm1,xmm0,04eh + por xmm0,xmm1 +DB 102,72,15,126,192 + inc edx + test rax,rax + cmovnz eax,edx + xor eax,1 + DB 0F3h,0C3h ;repret +vec_is_zero_16x ENDP +PUBLIC vec_is_equal_16x + + +ALIGN 32 +vec_is_equal_16x PROC PUBLIC + DB 243,15,30,250 + shr r8d,4 + movdqu xmm0,XMMWORD PTR[rcx] + movdqu xmm1,XMMWORD PTR[rdx] + sub rdx,rcx + lea rcx,QWORD PTR[16+rcx] + pxor xmm0,xmm1 + +$L$oop_is_equal:: + dec r8d + jz $L$oop_is_equal_done + movdqu xmm1,XMMWORD PTR[rcx] + movdqu xmm2,XMMWORD PTR[rdx*1+rcx] + lea rcx,QWORD PTR[16+rcx] + pxor xmm1,xmm2 + por xmm0,xmm1 + jmp $L$oop_is_equal + +$L$oop_is_equal_done:: + pshufd xmm1,xmm0,04eh + 
por xmm0,xmm1 +DB 102,72,15,126,192 + inc r8d + test rax,rax + cmovnz eax,r8d + xor eax,1 + DB 0F3h,0C3h ;repret +vec_is_equal_16x ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_add_mod_384 + DD imagerel $L$SEH_body_add_mod_384 + DD imagerel $L$SEH_info_add_mod_384_prologue + + DD imagerel $L$SEH_body_add_mod_384 + DD imagerel $L$SEH_epilogue_add_mod_384 + DD imagerel $L$SEH_info_add_mod_384_body + + DD imagerel $L$SEH_epilogue_add_mod_384 + DD imagerel $L$SEH_end_add_mod_384 + DD imagerel $L$SEH_info_add_mod_384_epilogue + + DD imagerel $L$SEH_begin_add_mod_384x + DD imagerel $L$SEH_body_add_mod_384x + DD imagerel $L$SEH_info_add_mod_384x_prologue + + DD imagerel $L$SEH_body_add_mod_384x + DD imagerel $L$SEH_epilogue_add_mod_384x + DD imagerel $L$SEH_info_add_mod_384x_body + + DD imagerel $L$SEH_epilogue_add_mod_384x + DD imagerel $L$SEH_end_add_mod_384x + DD imagerel $L$SEH_info_add_mod_384x_epilogue + + DD imagerel $L$SEH_begin_rshift_mod_384 + DD imagerel $L$SEH_body_rshift_mod_384 + DD imagerel $L$SEH_info_rshift_mod_384_prologue + + DD imagerel $L$SEH_body_rshift_mod_384 + DD imagerel $L$SEH_epilogue_rshift_mod_384 + DD imagerel $L$SEH_info_rshift_mod_384_body + + DD imagerel $L$SEH_epilogue_rshift_mod_384 + DD imagerel $L$SEH_end_rshift_mod_384 + DD imagerel $L$SEH_info_rshift_mod_384_epilogue + + DD imagerel $L$SEH_begin_div_by_2_mod_384 + DD imagerel $L$SEH_body_div_by_2_mod_384 + DD imagerel $L$SEH_info_div_by_2_mod_384_prologue + + DD imagerel $L$SEH_body_div_by_2_mod_384 + DD imagerel $L$SEH_epilogue_div_by_2_mod_384 + DD imagerel $L$SEH_info_div_by_2_mod_384_body + + DD imagerel $L$SEH_epilogue_div_by_2_mod_384 + DD imagerel $L$SEH_end_div_by_2_mod_384 + DD imagerel $L$SEH_info_div_by_2_mod_384_epilogue + + DD imagerel $L$SEH_begin_lshift_mod_384 + DD imagerel $L$SEH_body_lshift_mod_384 + DD imagerel $L$SEH_info_lshift_mod_384_prologue + + DD imagerel $L$SEH_body_lshift_mod_384 + DD imagerel $L$SEH_epilogue_lshift_mod_384 + DD imagerel $L$SEH_info_lshift_mod_384_body + + DD imagerel $L$SEH_epilogue_lshift_mod_384 + DD imagerel $L$SEH_end_lshift_mod_384 + DD imagerel $L$SEH_info_lshift_mod_384_epilogue + + DD imagerel $L$SEH_begin_mul_by_3_mod_384 + DD imagerel $L$SEH_body_mul_by_3_mod_384 + DD imagerel $L$SEH_info_mul_by_3_mod_384_prologue + + DD imagerel $L$SEH_body_mul_by_3_mod_384 + DD imagerel $L$SEH_epilogue_mul_by_3_mod_384 + DD imagerel $L$SEH_info_mul_by_3_mod_384_body + + DD imagerel $L$SEH_epilogue_mul_by_3_mod_384 + DD imagerel $L$SEH_end_mul_by_3_mod_384 + DD imagerel $L$SEH_info_mul_by_3_mod_384_epilogue + + DD imagerel $L$SEH_begin_mul_by_8_mod_384 + DD imagerel $L$SEH_body_mul_by_8_mod_384 + DD imagerel $L$SEH_info_mul_by_8_mod_384_prologue + + DD imagerel $L$SEH_body_mul_by_8_mod_384 + DD imagerel $L$SEH_epilogue_mul_by_8_mod_384 + DD imagerel $L$SEH_info_mul_by_8_mod_384_body + + DD imagerel $L$SEH_epilogue_mul_by_8_mod_384 + DD imagerel $L$SEH_end_mul_by_8_mod_384 + DD imagerel $L$SEH_info_mul_by_8_mod_384_epilogue + + DD imagerel $L$SEH_begin_mul_by_3_mod_384x + DD imagerel $L$SEH_body_mul_by_3_mod_384x + DD imagerel $L$SEH_info_mul_by_3_mod_384x_prologue + + DD imagerel $L$SEH_body_mul_by_3_mod_384x + DD imagerel $L$SEH_epilogue_mul_by_3_mod_384x + DD imagerel $L$SEH_info_mul_by_3_mod_384x_body + + DD imagerel $L$SEH_epilogue_mul_by_3_mod_384x + DD imagerel $L$SEH_end_mul_by_3_mod_384x + DD imagerel $L$SEH_info_mul_by_3_mod_384x_epilogue + + DD imagerel $L$SEH_begin_mul_by_8_mod_384x + DD imagerel 
$L$SEH_body_mul_by_8_mod_384x + DD imagerel $L$SEH_info_mul_by_8_mod_384x_prologue + + DD imagerel $L$SEH_body_mul_by_8_mod_384x + DD imagerel $L$SEH_epilogue_mul_by_8_mod_384x + DD imagerel $L$SEH_info_mul_by_8_mod_384x_body + + DD imagerel $L$SEH_epilogue_mul_by_8_mod_384x + DD imagerel $L$SEH_end_mul_by_8_mod_384x + DD imagerel $L$SEH_info_mul_by_8_mod_384x_epilogue + + DD imagerel $L$SEH_begin_cneg_mod_384 + DD imagerel $L$SEH_body_cneg_mod_384 + DD imagerel $L$SEH_info_cneg_mod_384_prologue + + DD imagerel $L$SEH_body_cneg_mod_384 + DD imagerel $L$SEH_epilogue_cneg_mod_384 + DD imagerel $L$SEH_info_cneg_mod_384_body + + DD imagerel $L$SEH_epilogue_cneg_mod_384 + DD imagerel $L$SEH_end_cneg_mod_384 + DD imagerel $L$SEH_info_cneg_mod_384_epilogue + + DD imagerel $L$SEH_begin_sub_mod_384 + DD imagerel $L$SEH_body_sub_mod_384 + DD imagerel $L$SEH_info_sub_mod_384_prologue + + DD imagerel $L$SEH_body_sub_mod_384 + DD imagerel $L$SEH_epilogue_sub_mod_384 + DD imagerel $L$SEH_info_sub_mod_384_body + + DD imagerel $L$SEH_epilogue_sub_mod_384 + DD imagerel $L$SEH_end_sub_mod_384 + DD imagerel $L$SEH_info_sub_mod_384_epilogue + + DD imagerel $L$SEH_begin_sub_mod_384x + DD imagerel $L$SEH_body_sub_mod_384x + DD imagerel $L$SEH_info_sub_mod_384x_prologue + + DD imagerel $L$SEH_body_sub_mod_384x + DD imagerel $L$SEH_epilogue_sub_mod_384x + DD imagerel $L$SEH_info_sub_mod_384x_body + + DD imagerel $L$SEH_epilogue_sub_mod_384x + DD imagerel $L$SEH_end_sub_mod_384x + DD imagerel $L$SEH_info_sub_mod_384x_epilogue + + DD imagerel $L$SEH_begin_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_body_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_info_mul_by_1_plus_i_mod_384x_prologue + + DD imagerel $L$SEH_body_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_epilogue_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_info_mul_by_1_plus_i_mod_384x_body + + DD imagerel $L$SEH_epilogue_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_end_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_info_mul_by_1_plus_i_mod_384x_epilogue + + DD imagerel $L$SEH_begin_sgn0_pty_mod_384 + DD imagerel $L$SEH_body_sgn0_pty_mod_384 + DD imagerel $L$SEH_info_sgn0_pty_mod_384_prologue + + DD imagerel $L$SEH_body_sgn0_pty_mod_384 + DD imagerel $L$SEH_epilogue_sgn0_pty_mod_384 + DD imagerel $L$SEH_info_sgn0_pty_mod_384_body + + DD imagerel $L$SEH_epilogue_sgn0_pty_mod_384 + DD imagerel $L$SEH_end_sgn0_pty_mod_384 + DD imagerel $L$SEH_info_sgn0_pty_mod_384_epilogue + + DD imagerel $L$SEH_begin_sgn0_pty_mod_384x + DD imagerel $L$SEH_body_sgn0_pty_mod_384x + DD imagerel $L$SEH_info_sgn0_pty_mod_384x_prologue + + DD imagerel $L$SEH_body_sgn0_pty_mod_384x + DD imagerel $L$SEH_epilogue_sgn0_pty_mod_384x + DD imagerel $L$SEH_info_sgn0_pty_mod_384x_body + + DD imagerel $L$SEH_epilogue_sgn0_pty_mod_384x + DD imagerel $L$SEH_end_sgn0_pty_mod_384x + DD imagerel $L$SEH_info_sgn0_pty_mod_384x_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_add_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_add_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_add_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_add_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 
+$L$SEH_info_add_mod_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h +$L$SEH_info_add_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_rshift_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_rshift_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_rshift_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_div_by_2_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_div_by_2_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_div_by_2_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_lshift_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_lshift_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_lshift_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_3_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_3_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_by_3_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_8_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_8_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_by_8_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_3_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_3_mod_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_by_3_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_8_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_8_mod_384x_body:: +DB 1,0,17,0 +DB 
000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_by_8_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_cneg_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_cneg_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_cneg_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sub_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sub_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sub_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sub_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sub_mod_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h +$L$SEH_info_sub_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_1_plus_i_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_1_plus_i_mod_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,007h,000h +DB 000h,0e4h,008h,000h +DB 000h,0d4h,009h,000h +DB 000h,0c4h,00ah,000h +DB 000h,034h,00bh,000h +DB 000h,054h,00ch,000h +DB 000h,074h,00eh,000h +DB 000h,064h,00fh,000h +DB 000h,0c2h +DB 000h,000h +$L$SEH_info_mul_by_1_plus_i_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0_pty_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sgn0_pty_mod_384_body:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sgn0_pty_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0_pty_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sgn0_pty_mod_384x_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h +$L$SEH_info_sgn0_pty_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/add_mod_384x384-x86_64.asm b/crypto/blst_src/build/win64/add_mod_384x384-x86_64.asm new file mode 100644 index 00000000000..57d1752fd3c --- /dev/null +++ b/crypto/blst_src/build/win64/add_mod_384x384-x86_64.asm @@ -0,0 +1,334 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + + +ALIGN 32 +__add_mod_384x384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + 
mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + + add r8,QWORD PTR[rdx] + mov r15,QWORD PTR[56+rsi] + adc r9,QWORD PTR[8+rdx] + mov rax,QWORD PTR[64+rsi] + adc r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[72+rsi] + adc r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[80+rsi] + adc r12,QWORD PTR[32+rdx] + mov rsi,QWORD PTR[88+rsi] + adc r13,QWORD PTR[40+rdx] + mov QWORD PTR[rdi],r8 + adc r14,QWORD PTR[48+rdx] + mov QWORD PTR[8+rdi],r9 + adc r15,QWORD PTR[56+rdx] + mov QWORD PTR[16+rdi],r10 + adc rax,QWORD PTR[64+rdx] + mov QWORD PTR[32+rdi],r12 + mov r8,r14 + adc rbx,QWORD PTR[72+rdx] + mov QWORD PTR[24+rdi],r11 + mov r9,r15 + adc rbp,QWORD PTR[80+rdx] + mov QWORD PTR[40+rdi],r13 + mov r10,rax + adc rsi,QWORD PTR[88+rdx] + mov r11,rbx + sbb rdx,rdx + + sub r14,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + mov r12,rbp + sbb rax,QWORD PTR[16+rcx] + sbb rbx,QWORD PTR[24+rcx] + sbb rbp,QWORD PTR[32+rcx] + mov r13,rsi + sbb rsi,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r14,r8 + cmovc r15,r9 + cmovc rax,r10 + mov QWORD PTR[48+rdi],r14 + cmovc rbx,r11 + mov QWORD PTR[56+rdi],r15 + cmovc rbp,r12 + mov QWORD PTR[64+rdi],rax + cmovc rsi,r13 + mov QWORD PTR[72+rdi],rbx + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rsi + + DB 0F3h,0C3h ;repret +__add_mod_384x384 ENDP + + +ALIGN 32 +__sub_mod_384x384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + + sub r8,QWORD PTR[rdx] + mov r15,QWORD PTR[56+rsi] + sbb r9,QWORD PTR[8+rdx] + mov rax,QWORD PTR[64+rsi] + sbb r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[72+rsi] + sbb r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[80+rsi] + sbb r12,QWORD PTR[32+rdx] + mov rsi,QWORD PTR[88+rsi] + sbb r13,QWORD PTR[40+rdx] + mov QWORD PTR[rdi],r8 + sbb r14,QWORD PTR[48+rdx] + mov r8,QWORD PTR[rcx] + mov QWORD PTR[8+rdi],r9 + sbb r15,QWORD PTR[56+rdx] + mov r9,QWORD PTR[8+rcx] + mov QWORD PTR[16+rdi],r10 + sbb rax,QWORD PTR[64+rdx] + mov r10,QWORD PTR[16+rcx] + mov QWORD PTR[24+rdi],r11 + sbb rbx,QWORD PTR[72+rdx] + mov r11,QWORD PTR[24+rcx] + mov QWORD PTR[32+rdi],r12 + sbb rbp,QWORD PTR[80+rdx] + mov r12,QWORD PTR[32+rcx] + mov QWORD PTR[40+rdi],r13 + sbb rsi,QWORD PTR[88+rdx] + mov r13,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r8,rdx + and r9,rdx + and r10,rdx + and r11,rdx + and r12,rdx + and r13,rdx + + add r14,r8 + adc r15,r9 + mov QWORD PTR[48+rdi],r14 + adc rax,r10 + mov QWORD PTR[56+rdi],r15 + adc rbx,r11 + mov QWORD PTR[64+rdi],rax + adc rbp,r12 + mov QWORD PTR[72+rdi],rbx + adc rsi,r13 + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rsi + + DB 0F3h,0C3h ;repret +__sub_mod_384x384 ENDP + +PUBLIC add_mod_384x384 + + +ALIGN 32 +add_mod_384x384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_add_mod_384x384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_add_mod_384x384:: + + + call __add_mod_384x384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_add_mod_384x384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 
0F3h,0C3h ;repret + +$L$SEH_end_add_mod_384x384:: +add_mod_384x384 ENDP + +PUBLIC sub_mod_384x384 + + +ALIGN 32 +sub_mod_384x384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sub_mod_384x384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sub_mod_384x384:: + + + call __sub_mod_384x384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sub_mod_384x384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sub_mod_384x384:: +sub_mod_384x384 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_add_mod_384x384 + DD imagerel $L$SEH_body_add_mod_384x384 + DD imagerel $L$SEH_info_add_mod_384x384_prologue + + DD imagerel $L$SEH_body_add_mod_384x384 + DD imagerel $L$SEH_epilogue_add_mod_384x384 + DD imagerel $L$SEH_info_add_mod_384x384_body + + DD imagerel $L$SEH_epilogue_add_mod_384x384 + DD imagerel $L$SEH_end_add_mod_384x384 + DD imagerel $L$SEH_info_add_mod_384x384_epilogue + + DD imagerel $L$SEH_begin_sub_mod_384x384 + DD imagerel $L$SEH_body_sub_mod_384x384 + DD imagerel $L$SEH_info_sub_mod_384x384_prologue + + DD imagerel $L$SEH_body_sub_mod_384x384 + DD imagerel $L$SEH_epilogue_sub_mod_384x384 + DD imagerel $L$SEH_info_sub_mod_384x384_body + + DD imagerel $L$SEH_epilogue_sub_mod_384x384 + DD imagerel $L$SEH_end_sub_mod_384x384 + DD imagerel $L$SEH_info_sub_mod_384x384_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_add_mod_384x384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_add_mod_384x384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_add_mod_384x384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sub_mod_384x384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sub_mod_384x384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sub_mod_384x384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/blst.def b/crypto/blst_src/build/win64/blst.def new file mode 100644 index 00000000000..3fbb6b3a97d --- /dev/null +++ b/crypto/blst_src/build/win64/blst.def @@ -0,0 +1,217 @@ +LIBRARY blst + +EXPORTS + blst_scalar_from_uint32 + blst_uint32_from_scalar + blst_scalar_from_uint64 + blst_uint64_from_scalar + blst_scalar_from_bendian + blst_bendian_from_scalar + blst_scalar_from_lendian + blst_lendian_from_scalar + blst_scalar_fr_check + blst_sk_check + blst_sk_add_n_check + blst_sk_sub_n_check + blst_sk_mul_n_check + blst_sk_inverse + blst_scalar_from_le_bytes + blst_scalar_from_be_bytes + blst_fr_add + blst_fr_sub + blst_fr_mul_by_3 + blst_fr_lshift + 
blst_fr_rshift + blst_fr_mul + blst_fr_sqr + blst_fr_cneg + blst_fr_eucl_inverse + blst_fr_inverse + blst_fr_from_uint64 + blst_uint64_from_fr + blst_fr_from_scalar + blst_scalar_from_fr + blst_fp_add + blst_fp_sub + blst_fp_mul_by_3 + blst_fp_mul_by_8 + blst_fp_lshift + blst_fp_mul + blst_fp_sqr + blst_fp_cneg + blst_fp_eucl_inverse + blst_fp_inverse + blst_fp_sqrt + blst_fp_from_uint32 + blst_uint32_from_fp + blst_fp_from_uint64 + blst_uint64_from_fp + blst_fp_from_bendian + blst_bendian_from_fp + blst_fp_from_lendian + blst_lendian_from_fp + blst_fp2_add + blst_fp2_sub + blst_fp2_mul_by_3 + blst_fp2_mul_by_8 + blst_fp2_lshift + blst_fp2_mul + blst_fp2_sqr + blst_fp2_cneg + blst_fp2_eucl_inverse + blst_fp2_inverse + blst_fp2_sqrt + blst_fp12_sqr + blst_fp12_cyclotomic_sqr + blst_fp12_mul + blst_fp12_mul_by_xy00z0 + blst_fp12_conjugate + blst_fp12_inverse + blst_fp12_frobenius_map + blst_fp12_is_equal + blst_fp12_is_one + blst_fp12_in_group + blst_fp12_one + blst_p1_add + blst_p1_add_or_double + blst_p1_add_affine + blst_p1_add_or_double_affine + blst_p1_double + blst_p1_mult + blst_p1_cneg + blst_p1_to_affine + blst_p1_from_affine + blst_p1_on_curve + blst_p1_in_g1 + blst_p1_is_equal + blst_p1_is_inf + blst_p1_generator + blst_p1_affine_on_curve + blst_p1_affine_in_g1 + blst_p1_affine_is_equal + blst_p1_affine_is_inf + blst_p1_affine_generator + blst_p2_add + blst_p2_add_or_double + blst_p2_add_affine + blst_p2_add_or_double_affine + blst_p2_double + blst_p2_mult + blst_p2_cneg + blst_p2_to_affine + blst_p2_from_affine + blst_p2_on_curve + blst_p2_in_g2 + blst_p2_is_equal + blst_p2_is_inf + blst_p2_generator + blst_p2_affine_on_curve + blst_p2_affine_in_g2 + blst_p2_affine_is_equal + blst_p2_affine_is_inf + blst_p2_affine_generator + blst_p1s_to_affine + blst_p1s_add + blst_p1s_mult_wbits_precompute_sizeof + blst_p1s_mult_wbits_precompute + blst_p1s_mult_wbits_scratch_sizeof + blst_p1s_mult_wbits + blst_p1s_mult_pippenger_scratch_sizeof + blst_p1s_mult_pippenger + blst_p1s_tile_pippenger + blst_p2s_to_affine + blst_p2s_add + blst_p2s_mult_wbits_precompute_sizeof + blst_p2s_mult_wbits_precompute + blst_p2s_mult_wbits_scratch_sizeof + blst_p2s_mult_wbits + blst_p2s_mult_pippenger_scratch_sizeof + blst_p2s_mult_pippenger + blst_p2s_tile_pippenger + blst_map_to_g1 + blst_map_to_g2 + blst_encode_to_g1 + blst_hash_to_g1 + blst_encode_to_g2 + blst_hash_to_g2 + blst_p1_serialize + blst_p1_compress + blst_p1_affine_serialize + blst_p1_affine_compress + blst_p1_uncompress + blst_p1_deserialize + blst_p2_serialize + blst_p2_compress + blst_p2_affine_serialize + blst_p2_affine_compress + blst_p2_uncompress + blst_p2_deserialize + blst_keygen + blst_sk_to_pk_in_g1 + blst_sign_pk_in_g1 + blst_sk_to_pk_in_g2 + blst_sign_pk_in_g2 + blst_miller_loop + blst_final_exp + blst_precompute_lines + blst_miller_loop_lines + blst_fp12_finalverify + blst_pairing_sizeof + blst_pairing_init + blst_pairing_get_dst + blst_pairing_commit + blst_pairing_aggregate_pk_in_g2 + blst_pairing_chk_n_aggr_pk_in_g2 + blst_pairing_mul_n_aggregate_pk_in_g2 + blst_pairing_chk_n_mul_n_aggr_pk_in_g2 + blst_pairing_aggregate_pk_in_g1 + blst_pairing_chk_n_aggr_pk_in_g1 + blst_pairing_mul_n_aggregate_pk_in_g1 + blst_pairing_chk_n_mul_n_aggr_pk_in_g1 + blst_pairing_merge + blst_pairing_finalverify + blst_aggregate_in_g1 + blst_aggregate_in_g2 + blst_aggregated_in_g1 + blst_aggregated_in_g2 + blst_core_verify_pk_in_g1 + blst_core_verify_pk_in_g2 + BLS12_381_G1 + BLS12_381_NEG_G1 + BLS12_381_G2 + BLS12_381_NEG_G2 + blst_fr_to + 
blst_fr_from + blst_fp_to + blst_fp_from + blst_fp_is_square + blst_fp2_is_square + blst_p1_from_jacobian + blst_p2_from_jacobian + blst_sk_to_pk2_in_g1 + blst_sign_pk2_in_g1 + blst_sk_to_pk2_in_g2 + blst_sign_pk2_in_g2 + blst_uniq_sizeof + blst_uniq_init + blst_uniq_test + blst_expand_message_xmd + blst_p1_unchecked_mult + blst_p2_unchecked_mult + blst_pairing_raw_aggregate + blst_pairing_as_fp12 + blst_bendian_from_fp12 + blst_keygen_v3 + blst_keygen_v4_5 + blst_keygen_v5 + blst_derive_master_eip2333 + blst_derive_child_eip2333 + blst_scalar_from_hexascii + blst_fr_from_hexascii + blst_fp_from_hexascii + blst_p1_sizeof + blst_p1_affine_sizeof + blst_p2_sizeof + blst_p2_affine_sizeof + blst_fp12_sizeof + diff --git a/crypto/blst_src/build/win64/ct_inverse_mod_256-armv8.asm b/crypto/blst_src/build/win64/ct_inverse_mod_256-armv8.asm new file mode 100644 index 00000000000..f3c2f0d05f9 --- /dev/null +++ b/crypto/blst_src/build/win64/ct_inverse_mod_256-armv8.asm @@ -0,0 +1,785 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + EXPORT |ct_inverse_mod_256|[FUNC] + ALIGN 32 +|ct_inverse_mod_256| PROC + DCDU 3573752639 + stp x29, x30, [sp,#-80]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + sub sp, sp, #1040 + + ldp x4, x5, [x1,#8*0] + ldp x6, x7, [x1,#8*2] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... + str x0, [sp] + + ldp x8, x9, [x2,#8*0] + ldp x10, x11, [x2,#8*2] + + stp x4, x5, [x1,#8*0] // copy input to |a| + stp x6, x7, [x1,#8*2] + stp x8, x9, [x1,#8*4] // copy modulus to |b| + stp x10, x11, [x1,#8*6] + + ////////////////////////////////////////// first iteration + bl |$Lab_approximation_31_256_loaded| + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + str x12,[x0,#8*8] // initialize |u| with |f0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to dst |b| + bl __smul_256_n_shift_by_31 + str x12, [x0,#8*9] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + ldr x8, [x1,#8*8] // |u| + ldr x9, [x1,#8*13] // |v| + madd x4, x16, x8, xzr // |u|*|f0| + madd x4, x17, x9, x4 // |v|*|g0| + str x4, [x0,#8*4] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*5] + stp x5, x5, [x0,#8*7] + + madd x4, x12, x8, xzr // |u|*|f1| + madd x4, x13, x9, x4 // |v|*|g1| + str x4, [x0,#8*9] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*10] + stp x5, x5, [x0,#8*12] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, 
#256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + 
mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl 
__ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + ////////////////////////////////////////// two[!] last iterations + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #47 // 31 + 512 % 31 + //bl __ab_approximation_62_256 // |a| and |b| are exact, + ldr x7, [x1,#8*0] // just load + ldr x11, [x1,#8*4] + bl __inner_loop_62_256 + + mov x16, x14 + mov x17, x15 + ldr x0, [sp] // original out_ptr + bl __smul_256x63 + bl __smul_512x63_tail + ldr x30, [x29,#8] + + smulh x20, x7, x17 // figure out top-most limb + ldp x8, x9, [x3,#8*0] + adc x23, x23, x25 + ldp x10, x11, [x3,#8*2] + + add x20, x20, x23 // x20 is 1, 0 or -1 + asr x19, x20, #63 // sign as mask + + and x23, x8, x19 // add mod<<256 conditionally + and x24, x9, x19 + adds x4, x4, x23 + and x25, x10, x19 + adcs x5, x5, x24 + and x26, x11, x19 + adcs x6, x6, x25 + adcs x7, x22, x26 + adc x20, x20, xzr // x20 is 1, 0 or -1 + + neg x19, x20 + orr x20, x20, x19 // excess bit or sign as mask + asr x19, x19, #63 // excess bit as mask + + and x8, x8, x20 // mask |mod| + and x9, x9, x20 + and x10, x10, x20 + and x11, x11, x20 + + eor x8, x8, x19 // conditionally negate |mod| + eor x9, x9, x19 + adds x8, x8, x19, lsr#63 + eor x10, x10, x19 + adcs x9, x9, xzr + eor x11, x11, x19 + adcs x10, x10, xzr + adc x11, x11, xzr + + adds x4, x4, x8 // final adjustment for |mod|<<256 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*4] + adc x7, x7, x11 + stp x6, x7, [x0,#8*6] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldr x29, [sp],#80 + DCDU 3573752767 + ret + ENDP + +//////////////////////////////////////////////////////////////////////// + + ALIGN 32 +|__smul_256x63| PROC + ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|) + asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x6, x7, [x1,#8*2+64] + eor x16, x16, x14 // conditionally negate |f_| (or |g_|) + ldr x22, [x1,#8*4+64] + + eor x4, x4, x14 // conditionally negate |u| (or |v|) + sub x16, x16, x14 + eor x5, x5, x14 + adds x4, x4, x14, lsr#63 + eor x6, x6, x14 + adcs x5, x5, xzr + eor x7, x7, x14 + adcs x6, x6, xzr + eor x22, x22, x14 + umulh x19, x4, x16 + adcs x7, x7, xzr + umulh x20, x5, x16 + adcs x22, x22, xzr + umulh x21, x6, x16 + mul x4, x4, x16 + cmp x16, #0 + mul x5, x5, x16 + cselne x22,x22,xzr + mul x6, x6, x16 + adds x5, x5, x19 + mul x24, x7, x16 + adcs x6, x6, x20 + adcs x24, x24, x21 + adc x26, xzr, xzr + ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|) + asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x10, x11, [x1,#8*2+104] + eor x17, x17, x14 // conditionally negate |f_| (or |g_|) + ldr x23, [x1,#8*4+104] + + eor x8, x8, x14 // conditionally negate |u| (or |v|) + sub x17, x17, x14 + eor x9, x9, x14 + adds x8, x8, x14, lsr#63 + eor x10, x10, x14 + adcs x9, x9, xzr + eor x11, x11, x14 + adcs x10, x10, xzr + eor x23, x23, x14 + umulh x19, x8, x17 + adcs x11, x11, xzr + umulh x20, x9, x17 + adcs x23, x23, xzr + umulh x21, x10, x17 + adc x15, xzr, xzr // used in 
__smul_512x63_tail + mul x8, x8, x17 + cmp x17, #0 + mul x9, x9, x17 + cselne x23,x23,xzr + mul x10, x10, x17 + adds x9, x9, x19 + mul x25, x11, x17 + adcs x10, x10, x20 + adcs x25, x25, x21 + adc x26, x26, xzr + + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*0] + adcs x24, x24, x25 + stp x6, x24, [x0,#8*2] + + ret + ENDP + + + ALIGN 32 +|__smul_512x63_tail| PROC + umulh x24, x7, x16 + ldp x5, x6, [x1,#8*18] // load rest of |v| + adc x26, x26, xzr + ldr x7, [x1,#8*20] + and x22, x22, x16 + + umulh x11, x11, x17 // resume |v|*|g1| chain + + sub x24, x24, x22 // tie up |u|*|f1| chain + asr x25, x24, #63 + + eor x5, x5, x14 // conditionally negate rest of |v| + eor x6, x6, x14 + adds x5, x5, x15 + eor x7, x7, x14 + adcs x6, x6, xzr + umulh x19, x23, x17 + adc x7, x7, xzr + umulh x20, x5, x17 + add x11, x11, x26 + umulh x21, x6, x17 + + mul x4, x23, x17 + mul x5, x5, x17 + adds x4, x4, x11 + mul x6, x6, x17 + adcs x5, x5, x19 + mul x22, x7, x17 + adcs x6, x6, x20 + adcs x22, x22, x21 + adc x23, xzr, xzr // used in the final step + + adds x4, x4, x24 + adcs x5, x5, x25 + adcs x6, x6, x25 + stp x4, x5, [x0,#8*4] + adcs x22, x22, x25 // carry is used in the final step + stp x6, x22, [x0,#8*6] + + ret + ENDP + + + ALIGN 32 +|__smul_256_n_shift_by_31| PROC + ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) + asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x6, x7, [x1,#8*2+0] + eor x25, x12, x24 // conditionally negate |f0| (or |g0|) + + eor x4, x4, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x5, x5, x24 + adds x4, x4, x24, lsr#63 + eor x6, x6, x24 + adcs x5, x5, xzr + eor x7, x7, x24 + umulh x19, x4, x25 + adcs x6, x6, xzr + umulh x20, x5, x25 + adc x7, x7, xzr + umulh x21, x6, x25 + and x24, x24, x25 + umulh x22, x7, x25 + neg x24, x24 + + mul x4, x4, x25 + mul x5, x5, x25 + mul x6, x6, x25 + adds x5, x5, x19 + mul x7, x7, x25 + adcs x6, x6, x20 + adcs x7, x7, x21 + adc x22, x22, x24 + ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) + asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x10, x11, [x1,#8*2+32] + eor x25, x13, x24 // conditionally negate |f0| (or |g0|) + + eor x8, x8, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x9, x9, x24 + adds x8, x8, x24, lsr#63 + eor x10, x10, x24 + adcs x9, x9, xzr + eor x11, x11, x24 + umulh x19, x8, x25 + adcs x10, x10, xzr + umulh x20, x9, x25 + adc x11, x11, xzr + umulh x21, x10, x25 + and x24, x24, x25 + umulh x23, x11, x25 + neg x24, x24 + + mul x8, x8, x25 + mul x9, x9, x25 + mul x10, x10, x25 + adds x9, x9, x19 + mul x11, x11, x25 + adcs x10, x10, x20 + adcs x11, x11, x21 + adc x23, x23, x24 + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x8, x22, x23 + + extr x4, x5, x4, #31 + extr x5, x6, x5, #31 + extr x6, x7, x6, #31 + asr x23, x8, #63 // result's sign as mask + extr x7, x8, x7, #31 + + eor x4, x4, x23 // ensure the result is positive + eor x5, x5, x23 + adds x4, x4, x23, lsr#63 + eor x6, x6, x23 + adcs x5, x5, xzr + eor x7, x7, x23 + adcs x6, x6, xzr + stp x4, x5, [x0,#8*0] + adc x7, x7, xzr + stp x6, x7, [x0,#8*2] + + eor x12, x12, x23 // adjust |f/g| accordingly + eor x13, x13, x23 + sub x12, x12, x23 + sub x13, x13, x23 + + ret + ENDP + + ALIGN 16 +|__ab_approximation_31_256| PROC + ldp x6, x7, [x1,#8*2] + ldp x10, x11, [x1,#8*6] + ldp x4, x5, [x1,#8*0] + ldp x8, x9, [x1,#8*4] + +|$Lab_approximation_31_256_loaded| + orr x19, x7, x11 // check top-most limbs, ... 
+ cmp x19, #0 + cselne x7,x7,x6 + cselne x11,x11,x10 + cselne x6,x6,x5 + orr x19, x7, x11 // and ones before top-most, ... + cselne x10,x10,x9 + + cmp x19, #0 + cselne x7,x7,x6 + cselne x11,x11,x10 + cselne x6,x6,x4 + orr x19, x7, x11 // and one more, ... + cselne x10,x10,x8 + + clz x19, x19 + cmp x19, #64 + cselne x19,x19,xzr + cselne x7,x7,x6 + cselne x11,x11,x10 + neg x20, x19 + + lslv x7, x7, x19 // align high limbs to the left + lslv x11, x11, x19 + lsrv x6, x6, x20 + lsrv x10, x10, x20 + and x6, x6, x20, asr#6 + and x10, x10, x20, asr#6 + orr x7, x7, x6 + orr x11, x11, x10 + + bfxil x7, x4, #0, #31 + bfxil x11, x8, #0, #31 + + b __inner_loop_31_256 + ret + ENDP + + + ALIGN 16 +|__inner_loop_31_256| PROC + mov x2, #31 + mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x23,#0x7FFFFFFF7FFFFFFF + +|$Loop_31_256| + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x15 + cselhs x11,x11,x7 + cselhs x7,x21,x20 + cselhs x15,x15,x13 + cselhs x13,x13,x19 + lsr x7, x7, #1 + and x19, x15, x22 + and x20, x23, x22 + sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x15, x15, x15 // |f1|<<=1 + add x13, x13, x20 + sub x15, x15, x23 + cbnz x2, |$Loop_31_256| + + mov x23, #0x7FFFFFFF + ubfx x12, x13, #0, #32 + ubfx x13, x13, #32, #32 + ubfx x14, x15, #0, #32 + ubfx x15, x15, #32, #32 + sub x12, x12, x23 // remove bias + sub x13, x13, x23 + sub x14, x14, x23 + sub x15, x15, x23 + + ret + ENDP + + + ALIGN 16 +|__inner_loop_62_256| PROC + mov x12, #1 // |f0|=1 + mov x13, #0 // |g0|=0 + mov x14, #0 // |f1|=0 + mov x15, #1 // |g1|=1 + +|$Loop_62_256| + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x12 + cselhs x11,x11,x7 + cselhs x7,x21,x20 + mov x20, x13 + cselhs x12,x12,x14 + cselhs x14,x14,x19 + cselhs x13,x13,x15 + cselhs x15,x15,x20 + lsr x7, x7, #1 + and x19, x14, x22 + and x20, x15, x22 + add x14, x14, x14 // |f1|<<=1 + add x15, x15, x15 // |g1|<<=1 + sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) 
+ cbnz x2, |$Loop_62_256| + + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/ct_inverse_mod_256-x86_64.asm b/crypto/blst_src/build/win64/ct_inverse_mod_256-x86_64.asm new file mode 100644 index 00000000000..65665c9f17a --- /dev/null +++ b/crypto/blst_src/build/win64/ct_inverse_mod_256-x86_64.asm @@ -0,0 +1,1211 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC ct_inverse_mod_256 + +ALIGN 32 +ct_inverse_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_ct_inverse_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,1072 + +$L$SEH_body_ct_inverse_mod_256:: + + + lea rax,QWORD PTR[((48+511))+rsp] + and rax,-512 + mov QWORD PTR[32+rsp],rdi + mov QWORD PTR[40+rsp],rcx + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + + mov r12,QWORD PTR[rdx] + mov r13,QWORD PTR[8+rdx] + mov r14,QWORD PTR[16+rdx] + mov r15,QWORD PTR[24+rdx] + + mov QWORD PTR[rax],r8 + mov QWORD PTR[8+rax],r9 + mov QWORD PTR[16+rax],r10 + mov QWORD PTR[24+rax],r11 + + mov QWORD PTR[32+rax],r12 + mov QWORD PTR[40+rax],r13 + mov QWORD PTR[48+rax],r14 + mov QWORD PTR[56+rax],r15 + mov rsi,rax + + + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + + + mov QWORD PTR[64+rdi],rdx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + + + mov QWORD PTR[72+rdi],rdx + + + xor rsi,256 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + + + + mov r8,QWORD PTR[64+rsi] + mov r12,QWORD PTR[104+rsi] + mov r9,r8 + imul r8,QWORD PTR[rsp] + mov r13,r12 + imul r12,QWORD PTR[8+rsp] + add r8,r12 + mov QWORD PTR[32+rdi],r8 + sar r8,63 + mov QWORD PTR[40+rdi],r8 + mov QWORD PTR[48+rdi],r8 + mov QWORD PTR[56+rdi],r8 + mov QWORD PTR[64+rdi],r8 + lea rsi,QWORD PTR[64+rsi] + + imul r9,rdx + imul r13,rcx + add r9,r13 + mov QWORD PTR[72+rdi],r9 + sar r9,63 + mov QWORD PTR[80+rdi],r9 + mov QWORD PTR[88+rdi],r9 + mov QWORD PTR[96+rdi],r9 + mov QWORD PTR[104+rdi],r9 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD 
PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD 
PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + sar rbp,63 + mov QWORD PTR[40+rdi],rbp + mov QWORD PTR[48+rdi],rbp + mov QWORD PTR[56+rdi],rbp + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_512x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_512x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_512x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_512x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_512x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + 
+ mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_512x63 + + xor rsi,256+8*8 + mov edx,47 + + mov r8,QWORD PTR[rsi] + + mov r10,QWORD PTR[32+rsi] + + call __inner_loop_62_256 + + + + + + + + lea rsi,QWORD PTR[64+rsi] + + + + + + mov rdx,r12 + mov rcx,r13 + mov rdi,QWORD PTR[32+rsp] + call __smulq_512x63 + adc rdx,rbp + + mov rsi,QWORD PTR[40+rsp] + mov rax,rdx + sar rdx,63 + + mov r8,rdx + mov r9,rdx + and r8,QWORD PTR[rsi] + mov r10,rdx + and r9,QWORD PTR[8+rsi] + and r10,QWORD PTR[16+rsi] + and rdx,QWORD PTR[24+rsi] + + add r12,r8 + adc r13,r9 + adc r14,r10 + adc r15,rdx + adc rax,0 + + mov rdx,rax + neg rax + or rdx,rax + sar rax,63 + + mov r8,rdx + mov r9,rdx + and r8,QWORD PTR[rsi] + mov r10,rdx + and r9,QWORD PTR[8+rsi] + and r10,QWORD PTR[16+rsi] + and rdx,QWORD PTR[24+rsi] + + xor r8,rax + xor rcx,rcx + xor r9,rax + sub rcx,rax + xor r10,rax + xor rdx,rax + add r8,rcx + adc r9,0 + adc r10,0 + adc rdx,0 + + add r12,r8 + adc r13,r9 + adc r14,r10 + adc r15,rdx + + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + mov QWORD PTR[48+rdi],r14 + mov QWORD PTR[56+rdi],r15 + + lea r8,QWORD PTR[1072+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_ct_inverse_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_ct_inverse_mod_256:: +ct_inverse_mod_256 ENDP + +ALIGN 32 +__smulq_512x63 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov rbp,QWORD PTR[32+rsi] + + mov rbx,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbx,rdx + add rbx,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor rbp,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc rbp,0 + + mul rbx + mov QWORD PTR[rdi],rax + mov rax,r9 + mov r9,rdx + mul rbx + add r9,rax + mov rax,r10 + adc rdx,0 + mov QWORD PTR[8+rdi],r9 + mov r10,rdx + mul rbx + add r10,rax + mov rax,r11 + adc rdx,0 + mov QWORD PTR[16+rdi],r10 + mov r11,rdx + and rbp,rbx + neg rbp + mul rbx + add r11,rax + adc rbp,rdx + mov QWORD PTR[24+rdi],r11 + + mov r8,QWORD PTR[40+rsi] + mov r9,QWORD PTR[48+rsi] + mov r10,QWORD PTR[56+rsi] + mov r11,QWORD PTR[64+rsi] + mov r12,QWORD PTR[72+rsi] + mov r13,QWORD PTR[80+rsi] + mov r14,QWORD PTR[88+rsi] + mov r15,QWORD PTR[96+rsi] + + mov rdx,rcx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rcx,rdx + add rcx,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + xor r14,rdx + xor r15,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + + mul rcx + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rcx + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rcx + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rcx + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rcx + add r12,rax + mov rax,r13 + adc rdx,0 + mov 
r13,rdx + mul rcx + add r13,rax + mov rax,r14 + adc rdx,0 + mov r14,rdx + mul rcx + add r14,rax + mov rax,r15 + adc rdx,0 + mov r15,rdx + imul rcx + add r15,rax + adc rdx,0 + + mov rbx,rbp + sar rbp,63 + + add r8,QWORD PTR[rdi] + adc r9,QWORD PTR[8+rdi] + adc r10,QWORD PTR[16+rdi] + adc r11,QWORD PTR[24+rdi] + adc r12,rbx + adc r13,rbp + adc r14,rbp + adc r15,rbp + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + mov QWORD PTR[48+rdi],r14 + mov QWORD PTR[56+rdi],r15 + + DB 0F3h,0C3h ;repret +__smulq_512x63 ENDP + + +ALIGN 32 +__smulq_256x63 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[((0+0))+rsi] + mov r9,QWORD PTR[((0+8))+rsi] + mov r10,QWORD PTR[((0+16))+rsi] + mov r11,QWORD PTR[((0+24))+rsi] + mov rbp,QWORD PTR[((0+32))+rsi] + + mov rbx,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbx,rdx + add rbx,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor rbp,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc rbp,0 + + mul rbx + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbx + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbx + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + and rbp,rbx + neg rbp + mul rbx + add r11,rax + adc rbp,rdx + mov rdx,rcx + mov r12,QWORD PTR[((40+0))+rsi] + mov r13,QWORD PTR[((40+8))+rsi] + mov r14,QWORD PTR[((40+16))+rsi] + mov r15,QWORD PTR[((40+24))+rsi] + mov rcx,QWORD PTR[((40+32))+rsi] + + mov rbx,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbx,rdx + add rbx,rax + + xor r12,rdx + xor r13,rdx + xor r14,rdx + xor r15,rdx + xor rcx,rdx + add rax,r12 + adc r13,0 + adc r14,0 + adc r15,0 + adc rcx,0 + + mul rbx + mov r12,rax + mov rax,r13 + mov r13,rdx + mul rbx + add r13,rax + mov rax,r14 + adc rdx,0 + mov r14,rdx + mul rbx + add r14,rax + mov rax,r15 + adc rdx,0 + mov r15,rdx + and rcx,rbx + neg rcx + mul rbx + add r15,rax + adc rcx,rdx + add r8,r12 + adc r9,r13 + adc r10,r14 + adc r11,r15 + adc rbp,rcx + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],rbp + + DB 0F3h,0C3h ;repret +__smulq_256x63 ENDP + +ALIGN 32 +__smulq_256_n_shift_by_31 PROC PRIVATE + DB 243,15,30,250 + mov QWORD PTR[rdi],rdx + mov QWORD PTR[8+rdi],rcx + mov rbp,rdx + mov r8,QWORD PTR[((0+0))+rsi] + mov r9,QWORD PTR[((0+8))+rsi] + mov r10,QWORD PTR[((0+16))+rsi] + mov r11,QWORD PTR[((0+24))+rsi] + + mov rbx,rbp + sar rbp,63 + xor rax,rax + sub rax,rbp + + xor rbx,rbp + add rbx,rax + + xor r8,rbp + xor r9,rbp + xor r10,rbp + xor r11,rbp + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + + mul rbx + mov r8,rax + mov rax,r9 + and rbp,rbx + neg rbp + mov r9,rdx + mul rbx + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbx + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbx + add r11,rax + adc rbp,rdx + mov r12,QWORD PTR[((32+0))+rsi] + mov r13,QWORD PTR[((32+8))+rsi] + mov r14,QWORD PTR[((32+16))+rsi] + mov r15,QWORD PTR[((32+24))+rsi] + + mov rbx,rcx + sar rcx,63 + xor rax,rax + sub rax,rcx + + xor rbx,rcx + add rbx,rax + + xor r12,rcx + xor r13,rcx + xor r14,rcx + xor r15,rcx + add rax,r12 + adc r13,0 + adc r14,0 + adc r15,0 + + mul rbx + mov r12,rax + mov rax,r13 + and rcx,rbx + neg rcx + mov r13,rdx + mul rbx + add r13,rax + mov rax,r14 + adc rdx,0 + mov r14,rdx + mul rbx + add r14,rax + mov rax,r15 + adc rdx,0 + mov r15,rdx + mul rbx + add r15,rax + adc rcx,rdx + add r8,r12 + adc r9,r13 + adc r10,r14 + adc r11,r15 + 
adc rbp,rcx + + mov rdx,QWORD PTR[rdi] + mov rcx,QWORD PTR[8+rdi] + + shrd r8,r9,31 + shrd r9,r10,31 + shrd r10,r11,31 + shrd r11,rbp,31 + + sar rbp,63 + xor rax,rax + sub rax,rbp + + xor r8,rbp + xor r9,rbp + xor r10,rbp + xor r11,rbp + add r8,rax + adc r9,0 + adc r10,0 + adc r11,0 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + xor rdx,rbp + xor rcx,rbp + add rdx,rax + add rcx,rax + + DB 0F3h,0C3h ;repret +__smulq_256_n_shift_by_31 ENDP + +ALIGN 32 +__ab_approximation_31_256 PROC PRIVATE + DB 243,15,30,250 + mov r9,QWORD PTR[24+rsi] + mov r11,QWORD PTR[56+rsi] + mov rbx,QWORD PTR[16+rsi] + mov rbp,QWORD PTR[48+rsi] + mov r8,QWORD PTR[8+rsi] + mov r10,QWORD PTR[40+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + mov r8,QWORD PTR[rsi] + cmovz rbp,r10 + mov r10,QWORD PTR[32+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + cmovz rbp,r10 + + mov rax,r9 + or rax,r11 + bsr rcx,rax + lea rcx,QWORD PTR[1+rcx] + cmovz r9,r8 + cmovz r11,r10 + cmovz rcx,rax + neg rcx + + + shld r9,rbx,cl + shld r11,rbp,cl + + mov eax,07FFFFFFFh + and r8,rax + and r10,rax + not rax + and r9,rax + and r11,rax + or r8,r9 + or r10,r11 + + jmp __inner_loop_31_256 + + DB 0F3h,0C3h ;repret +__ab_approximation_31_256 ENDP + +ALIGN 32 +__inner_loop_31_256 PROC PRIVATE + DB 243,15,30,250 + mov rcx,07FFFFFFF80000000h + mov r13,0800000007FFFFFFFh + mov r15,07FFFFFFF7FFFFFFFh + +$L$oop_31_256:: + cmp r8,r10 + mov rax,r8 + mov rbx,r10 + mov rbp,rcx + mov r14,r13 + cmovb r8,r10 + cmovb r10,rax + cmovb rcx,r13 + cmovb r13,rbp + + sub r8,r10 + sub rcx,r13 + add rcx,r15 + + test rax,1 + cmovz r8,rax + cmovz r10,rbx + cmovz rcx,rbp + cmovz r13,r14 + + shr r8,1 + add r13,r13 + sub r13,r15 + sub edx,1 + jnz $L$oop_31_256 + + shr r15,32 + mov edx,ecx + mov r12d,r13d + shr rcx,32 + shr r13,32 + sub rdx,r15 + sub rcx,r15 + sub r12,r15 + sub r13,r15 + + DB 0F3h,0C3h ;repret +__inner_loop_31_256 ENDP + + +ALIGN 32 +__inner_loop_62_256 PROC PRIVATE + DB 243,15,30,250 + mov r15d,edx + mov rdx,1 + xor rcx,rcx + xor r12,r12 + mov r13,rdx + mov r14,rdx + +$L$oop_62_256:: + xor rax,rax + test r8,r14 + mov rbx,r10 + cmovnz rax,r10 + sub rbx,r8 + mov rbp,r8 + sub r8,rax + cmovc r8,rbx + cmovc r10,rbp + mov rax,rdx + cmovc rdx,r12 + cmovc r12,rax + mov rbx,rcx + cmovc rcx,r13 + cmovc r13,rbx + xor rax,rax + xor rbx,rbx + shr r8,1 + test rbp,r14 + cmovnz rax,r12 + cmovnz rbx,r13 + add r12,r12 + add r13,r13 + sub rdx,rax + sub rcx,rbx + sub r15d,1 + jnz $L$oop_62_256 + + DB 0F3h,0C3h ;repret +__inner_loop_62_256 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_ct_inverse_mod_256 + DD imagerel $L$SEH_body_ct_inverse_mod_256 + DD imagerel $L$SEH_info_ct_inverse_mod_256_prologue + + DD imagerel $L$SEH_body_ct_inverse_mod_256 + DD imagerel $L$SEH_epilogue_ct_inverse_mod_256 + DD imagerel $L$SEH_info_ct_inverse_mod_256_body + + DD imagerel $L$SEH_epilogue_ct_inverse_mod_256 + DD imagerel $L$SEH_end_ct_inverse_mod_256 + DD imagerel $L$SEH_info_ct_inverse_mod_256_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_ct_inverse_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_ct_inverse_mod_256_body:: +DB 1,0,18,0 +DB 000h,0f4h,086h,000h +DB 000h,0e4h,087h,000h +DB 000h,0d4h,088h,000h +DB 000h,0c4h,089h,000h +DB 000h,034h,08ah,000h +DB 000h,054h,08bh,000h +DB 000h,074h,08dh,000h +DB 000h,064h,08eh,000h +DB 
000h,001h,08ch,000h +$L$SEH_info_ct_inverse_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/ct_inverse_mod_384-armv8.asm b/crypto/blst_src/build/win64/ct_inverse_mod_384-armv8.asm new file mode 100644 index 00000000000..4ab12e052df --- /dev/null +++ b/crypto/blst_src/build/win64/ct_inverse_mod_384-armv8.asm @@ -0,0 +1,718 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + EXPORT |ct_inverse_mod_383|[FUNC] + ALIGN 32 +|ct_inverse_mod_383| PROC + DCDU 3573752639 + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #1040 + + ldp x22, x4, [x1,#8*0] + ldp x5, x6, [x1,#8*2] + ldp x7, x8, [x1,#8*4] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... + stp x0, x3, [sp] + + ldp x9, x10, [x2,#8*0] + ldp x11, x12, [x2,#8*2] + ldp x13, x14, [x2,#8*4] + + stp x22, x4, [x1,#8*0] // copy input to |a| + stp x5, x6, [x1,#8*2] + stp x7, x8, [x1,#8*4] + stp x9, x10, [x1,#8*6] // copy modulus to |b| + stp x11, x12, [x1,#8*8] + stp x13, x14, [x1,#8*10] + + ////////////////////////////////////////// first iteration + mov x2, #62 + bl |$Lab_approximation_62_loaded| + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + str x15,[x0,#8*12] // initialize |u| with |f0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to dst |b| + bl __smul_383_n_shift_by_62 + str x15, [x0,#8*12] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + ldr x7, [x1,#8*12] // |u| + ldr x8, [x1,#8*18] // |v| + mul x3, x20, x7 // |u|*|f0| + smulh x4, x20, x7 + mul x5, x21, x8 // |v|*|g0| + smulh x6, x21, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*6] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*8] + stp x5, x5, [x0,#8*10] + + mul x3, x15, x7 // |u|*|f1| + smulh x4, x15, x7 + mul x5, x16, x8 // |v|*|g1| + smulh x6, x16, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*12] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*14] + stp x5, x5, [x0,#8*16] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add 
x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + asr x27, x27, #63 // sign extension + stp x27, x27, [x0,#8*6] + stp x27, x27, [x0,#8*8] + stp x27, x27, [x0,#8*10] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl 
__smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + ////////////////////////////////////////// iteration before last + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldp x3, x8, [x1,#8*0] // just load + ldp x9, x14, [x1,#8*6] + bl __inner_loop_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + str x3, [x0,#8*0] + str x9, [x0,#8*6] + + mov x20, x15 // exact |f0| + mov x21, x16 // exact |g0| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*12 // pointer to dst |u| + bl __smul_383x63 + + mov x20, x15 // exact |f1| + mov x21, x16 // exact |g1| + add x0, x0, #8*6 // pointer to dst |v| + bl __smul_383x63 + bl __smul_767x63_tail + + ////////////////////////////////////////// last iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #22 // 766 % 62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldr x3, [x1,#8*0] // just load + eor x8, x8, x8 + ldr x9, [x1,#8*6] + eor x14, x14, x14 + bl __inner_loop_62 + + mov x20, x17 + mov x21, x19 + ldp x0, x15, [sp] // original out_ptr and n_ptr + bl __smul_383x63 + bl __smul_767x63_tail + ldr x30, [x29,#8] + + asr x22, x8, #63 // sign as mask + ldp x9, x10, [x15,#8*0] + ldp x11, x12, [x15,#8*2] + ldp x13, x14, [x15,#8*4] + + and x9, x9, x22 // add mod<<384 conditionally + and x10, x10, x22 + adds x3, x3, x9 + and x11, x11, x22 + adcs x4, x4, x10 + and x12, x12, x22 + adcs x5, x5, x11 + and x13, x13, x22 + adcs x6, x6, x12 + and x14, x14, x22 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*8] + adc x8, x8, x14 + stp x7, x8, [x0,#8*10] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + DCDU 3573752767 + ret + ENDP + +//////////////////////////////////////////////////////////////////////// +// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
+ + ALIGN 32 +|__smul_383x63| PROC + ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) + asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x5, x6, [x1,#8*2+96] + eor x20, x20, x17 // conditionally negate |f_| (or |g_|) + ldp x7, x8, [x1,#8*4+96] + + eor x3, x3, x17 // conditionally negate |u| (or |v|) + sub x20, x20, x17 + eor x4, x4, x17 + adds x3, x3, x17, lsr#63 + eor x5, x5, x17 + adcs x4, x4, xzr + eor x6, x6, x17 + adcs x5, x5, xzr + eor x7, x7, x17 + adcs x6, x6, xzr + umulh x22, x3, x20 + eor x8, x8, x17 + umulh x23, x4, x20 + adcs x7, x7, xzr + umulh x24, x5, x20 + adcs x8, x8, xzr + umulh x25, x6, x20 + umulh x26, x7, x20 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x22 + mul x6, x6, x20 + adcs x5, x5, x23 + mul x7, x7, x20 + adcs x6, x6, x24 + mul x27,x8, x20 + adcs x7, x7, x25 + adcs x27,x27,x26 + adc x2, xzr, xzr + ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|) + asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x11, x12, [x1,#8*2+144] + eor x21, x21, x17 // conditionally negate |f_| (or |g_|) + ldp x13, x14, [x1,#8*4+144] + + eor x9, x9, x17 // conditionally negate |u| (or |v|) + sub x21, x21, x17 + eor x10, x10, x17 + adds x9, x9, x17, lsr#63 + eor x11, x11, x17 + adcs x10, x10, xzr + eor x12, x12, x17 + adcs x11, x11, xzr + eor x13, x13, x17 + adcs x12, x12, xzr + umulh x22, x9, x21 + eor x14, x14, x17 + umulh x23, x10, x21 + adcs x13, x13, xzr + umulh x24, x11, x21 + adcs x14, x14, xzr + umulh x25, x12, x21 + adc x19, xzr, xzr // used in __smul_767x63_tail + umulh x26, x13, x21 + mul x9, x9, x21 + mul x10, x10, x21 + mul x11, x11, x21 + adds x10, x10, x22 + mul x12, x12, x21 + adcs x11, x11, x23 + mul x13, x13, x21 + adcs x12, x12, x24 + mul x28,x14, x21 + adcs x13, x13, x25 + adcs x28,x28,x26 + adc x2, x2, xzr + + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + stp x3, x4, [x0,#8*0] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*2] + adcs x27, x27, x28 + stp x7, x27, [x0,#8*4] + adc x28, x2, xzr // used in __smul_767x63_tail + + ret + ENDP + + + ALIGN 32 +|__smul_767x63_tail| PROC + smulh x27, x8, x20 + ldp x3, x4, [x1,#8*24] // load rest of |v| + umulh x14,x14, x21 + ldp x5, x6, [x1,#8*26] + ldp x7, x8, [x1,#8*28] + + eor x3, x3, x17 // conditionally negate rest of |v| + eor x4, x4, x17 + eor x5, x5, x17 + adds x3, x3, x19 + eor x6, x6, x17 + adcs x4, x4, xzr + eor x7, x7, x17 + adcs x5, x5, xzr + eor x8, x8, x17 + adcs x6, x6, xzr + umulh x22, x3, x21 + adcs x7, x7, xzr + umulh x23, x4, x21 + adc x8, x8, xzr + + umulh x24, x5, x21 + add x14, x14, x28 + umulh x25, x6, x21 + asr x28, x27, #63 + umulh x26, x7, x21 + mul x3, x3, x21 + mul x4, x4, x21 + mul x5, x5, x21 + adds x3, x3, x14 + mul x6, x6, x21 + adcs x4, x4, x22 + mul x7, x7, x21 + adcs x5, x5, x23 + mul x8, x8, x21 + adcs x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, x26 + + adds x3, x3, x27 + adcs x4, x4, x28 + adcs x5, x5, x28 + adcs x6, x6, x28 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x28 + stp x5, x6, [x0,#8*8] + adc x8, x8, x28 + stp x7, x8, [x0,#8*10] + + ret + ENDP + + + ALIGN 32 +|__smul_383_n_shift_by_62| PROC + ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) + asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x5, x6, [x1,#8*2+0] + eor x2, x15, x28 // conditionally negate |f0| (or |g0|) + ldp x7, x8, [x1,#8*4+0] + + eor x3, x3, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + umulh x22, x3, x2 + 
adcs x6, x6, xzr + umulh x23, x4, x2 + eor x8, x8, x28 + umulh x24, x5, x2 + adcs x7, x7, xzr + umulh x25, x6, x2 + adc x8, x8, xzr + + umulh x26, x7, x2 + smulh x27, x8, x2 + mul x3, x3, x2 + mul x4, x4, x2 + mul x5, x5, x2 + adds x4, x4, x22 + mul x6, x6, x2 + adcs x5, x5, x23 + mul x7, x7, x2 + adcs x6, x6, x24 + mul x8, x8, x2 + adcs x7, x7, x25 + adcs x8, x8 ,x26 + adc x27, x27, xzr + ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) + asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x11, x12, [x1,#8*2+48] + eor x2, x16, x28 // conditionally negate |f0| (or |g0|) + ldp x13, x14, [x1,#8*4+48] + + eor x9, x9, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x10, x10, x28 + adds x9, x9, x28, lsr#63 + eor x11, x11, x28 + adcs x10, x10, xzr + eor x12, x12, x28 + adcs x11, x11, xzr + eor x13, x13, x28 + umulh x22, x9, x2 + adcs x12, x12, xzr + umulh x23, x10, x2 + eor x14, x14, x28 + umulh x24, x11, x2 + adcs x13, x13, xzr + umulh x25, x12, x2 + adc x14, x14, xzr + + umulh x26, x13, x2 + smulh x28, x14, x2 + mul x9, x9, x2 + mul x10, x10, x2 + mul x11, x11, x2 + adds x10, x10, x22 + mul x12, x12, x2 + adcs x11, x11, x23 + mul x13, x13, x2 + adcs x12, x12, x24 + mul x14, x14, x2 + adcs x13, x13, x25 + adcs x14, x14 ,x26 + adc x28, x28, xzr + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x27, x28 + + extr x3, x4, x3, #62 + extr x4, x5, x4, #62 + extr x5, x6, x5, #62 + asr x28, x9, #63 + extr x6, x7, x6, #62 + extr x7, x8, x7, #62 + extr x8, x9, x8, #62 + + eor x3, x3, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + adcs x6, x6, xzr + eor x8, x8, x28 + stp x3, x4, [x0,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x0,#8*2] + adc x8, x8, xzr + stp x7, x8, [x0,#8*4] + + eor x15, x15, x28 + eor x16, x16, x28 + sub x15, x15, x28 + sub x16, x16, x28 + + ret + ENDP + + ALIGN 16 +|__ab_approximation_62| PROC + ldp x7, x8, [x1,#8*4] + ldp x13, x14, [x1,#8*10] + ldp x5, x6, [x1,#8*2] + ldp x11, x12, [x1,#8*8] + +|$Lab_approximation_62_loaded| + orr x22, x8, x14 // check top-most limbs, ... + cmp x22, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x6 + orr x22, x8, x14 // ... ones before top-most, ... + cselne x13,x13,x12 + + ldp x3, x4, [x1,#8*0] + ldp x9, x10, [x1,#8*6] + + cmp x22, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x5 + orr x22, x8, x14 // ... and ones before that ... 
+ cselne x13,x13,x11 + + cmp x22, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x4 + orr x22, x8, x14 + cselne x13,x13,x10 + + clz x22, x22 + cmp x22, #64 + cselne x22,x22,xzr + cselne x8,x8,x7 + cselne x14,x14,x13 + neg x23, x22 + + lslv x8, x8, x22 // align high limbs to the left + lslv x14, x14, x22 + lsrv x7, x7, x23 + lsrv x13, x13, x23 + and x7, x7, x23, asr#6 + and x13, x13, x23, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + b __inner_loop_62 + ret + ENDP + + ALIGN 16 +|__inner_loop_62| PROC + mov x15, #1 // |f0|=1 + mov x16, #0 // |g0|=0 + mov x17, #0 // |f1|=0 + mov x19, #1 // |g1|=1 + +|$Loop_62| + sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + subs x24, x9, x3 // |b_|-|a_| + and x22, x9, x28 + sbc x25, x14, x8 + and x23, x14, x28 + subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x22, x15 + sbcs x27, x8, x23 + mov x23, x16 + cselhs x9,x9,x3 + cselhs x14,x14,x8 + cselhs x3,x26,x24 + cselhs x8,x27,x25 + cselhs x15,x15,x17 + cselhs x17,x17,x22 + cselhs x16,x16,x19 + cselhs x19,x19,x23 + extr x3, x8, x3, #1 + lsr x8, x8, #1 + and x22, x17, x28 + and x23, x19, x28 + add x17, x17, x17 // |f1|<<=1 + add x19, x19, x19 // |g1|<<=1 + sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...) + cbnz x2, |$Loop_62| + + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/ct_is_square_mod_384-armv8.asm b/crypto/blst_src/build/win64/ct_is_square_mod_384-armv8.asm new file mode 100644 index 00000000000..ab72328f056 --- /dev/null +++ b/crypto/blst_src/build/win64/ct_is_square_mod_384-armv8.asm @@ -0,0 +1,325 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + EXPORT |ct_is_square_mod_384|[FUNC] + ALIGN 32 +|ct_is_square_mod_384| PROC + DCDU 3573752639 + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #512 + + ldp x3, x4, [x0,#8*0] // load input + ldp x5, x6, [x0,#8*2] + ldp x7, x8, [x0,#8*4] + + add x0, sp, #255 // find closest 256-byte-aligned spot + and x0, x0, #-256 // in the frame... 
+ + ldp x9, x10, [x1,#8*0] // load modulus + ldp x11, x12, [x1,#8*2] + ldp x13, x14, [x1,#8*4] + + stp x3, x4, [x0,#8*6] // copy input to |a| + stp x5, x6, [x0,#8*8] + stp x7, x8, [x0,#8*10] + stp x9, x10, [x0,#8*0] // copy modulus to |b| + stp x11, x12, [x0,#8*2] + stp x13, x14, [x0,#8*4] + + eor x2, x2, x2 // init the |$Legendre| symbol + mov x15, #24 // 24 is 768/30-1 + b |$Loop_is_square| + + ALIGN 16 +|$Loop_is_square| + bl __ab_approximation_30 + sub x15, x15, #1 + + eor x1, x0, #128 // pointer to dst |b| + bl __smul_384_n_shift_by_30 + + mov x19, x16 // |f0| + mov x20, x17 // |g0| + add x1, x1, #8*6 // pointer to dst |a| + bl __smul_384_n_shift_by_30 + + ldp x9, x10, [x1,#-8*6] + eor x0, x0, #128 // flip-flop src |a|b| + and x27, x27, x9 // if |a| was negative, + add x2, x2, x27, lsr#1 // adjust |L| + + cbnz x15, |$Loop_is_square| + + ////////////////////////////////////////// last iteration + //bl __ab_approximation_30 // |a| and |b| are exact, + //ldr x8, [x0,#8*6] // and loaded + //ldr x14, [x0,#8*0] + mov x15, #48 // 48 is 768%30 + 30 + bl __inner_loop_48 + ldr x30, [x29,#8] + + and x0, x2, #1 + eor x0, x0, #1 + + add sp, sp, #512 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__smul_384_n_shift_by_30| PROC + ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|) + asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x5, x6, [x0,#8*2+0] + eor x20, x20, x27 // conditionally negate |g1| (or |f1|) + ldp x7, x8, [x0,#8*4+0] + + eor x3, x3, x27 // conditionally negate |b| (or |a|) + sub x20, x20, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + umulh x21, x3, x20 + adcs x6, x6, xzr + umulh x22, x4, x20 + eor x8, x8, x27 + umulh x23, x5, x20 + adcs x7, x7, xzr + umulh x24, x6, x20 + adc x8, x8, xzr + + umulh x25, x7, x20 + and x28, x20, x27 + umulh x26, x8, x20 + neg x28, x28 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x21 + mul x6, x6, x20 + adcs x5, x5, x22 + mul x7, x7, x20 + adcs x6, x6, x23 + mul x8, x8, x20 + adcs x7, x7, x24 + adcs x8, x8 ,x25 + adc x26, x26, x28 + ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|) + asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x11, x12, [x0,#8*2+48] + eor x19, x19, x27 // conditionally negate |g1| (or |f1|) + ldp x13, x14, [x0,#8*4+48] + + eor x9, x9, x27 // conditionally negate |b| (or |a|) + sub x19, x19, x27 + eor x10, x10, x27 + adds x9, x9, x27, lsr#63 + eor x11, x11, x27 + adcs x10, x10, xzr + eor x12, x12, x27 + adcs x11, x11, xzr + eor x13, x13, x27 + umulh x21, x9, x19 + adcs x12, x12, xzr + umulh x22, x10, x19 + eor x14, x14, x27 + umulh x23, x11, x19 + adcs x13, x13, xzr + umulh x24, x12, x19 + adc x14, x14, xzr + + umulh x25, x13, x19 + and x28, x19, x27 + umulh x27, x14, x19 + neg x28, x28 + mul x9, x9, x19 + mul x10, x10, x19 + mul x11, x11, x19 + adds x10, x10, x21 + mul x12, x12, x19 + adcs x11, x11, x22 + mul x13, x13, x19 + adcs x12, x12, x23 + mul x14, x14, x19 + adcs x13, x13, x24 + adcs x14, x14 ,x25 + adc x27, x27, x28 + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x26, x27 + + extr x3, x4, x3, #30 + extr x4, x5, x4, #30 + extr x5, x6, x5, #30 + asr x27, x9, #63 + extr x6, x7, x6, #30 + extr x7, x8, x7, #30 + extr x8, x9, x8, #30 + + eor x3, x3, x27 + eor x4, x4, x27 + adds 
x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + adcs x6, x6, xzr + eor x8, x8, x27 + stp x3, x4, [x1,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x1,#8*2] + adc x8, x8, xzr + stp x7, x8, [x1,#8*4] + + ret + ENDP + + ALIGN 16 +|__ab_approximation_30| PROC + ldp x13, x14, [x0,#8*4] // |a| is still in registers + ldp x11, x12, [x0,#8*2] + + orr x21, x8, x14 // check top-most limbs, ... + cmp x21, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x6 + orr x21, x8, x14 // ... ones before top-most, ... + cselne x13,x13,x12 + + cmp x21, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x5 + orr x21, x8, x14 // ... and ones before that ... + cselne x13,x13,x11 + + cmp x21, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x4 + orr x21, x8, x14 // and one more, ... + cselne x13,x13,x10 + + cmp x21, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x3 + orr x21, x8, x14 + cselne x13,x13,x9 + + clz x21, x21 + cmp x21, #64 + cselne x21,x21,xzr + cselne x8,x8,x7 + cselne x14,x14,x13 + neg x22, x21 + + lslv x8, x8, x21 // align high limbs to the left + lslv x14, x14, x21 + lsrv x7, x7, x22 + lsrv x13, x13, x22 + and x7, x7, x22, asr#6 + and x13, x13, x22, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + bfxil x8, x3, #0, #32 + bfxil x14, x9, #0, #32 + + b __inner_loop_30 + ret + ENDP + + + ALIGN 16 +|__inner_loop_30| PROC + mov x28, #30 + mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x27,#0x7FFFFFFF7FFFFFFF + +|$Loop_30| + sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x8, x14 + sub x28, x28, #1 + and x21, x14, x24 + + sub x22, x14, x8 // |b_|-|a_| + subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 + mov x21, x20 + cselhs x14,x14,x8 + cselhs x8,x23,x22 + cselhs x20,x20,x17 + cselhs x17,x17,x21 + cselhs x2,x2,x25 + lsr x8, x8, #1 + and x21, x20, x24 + and x22, x27, x24 + add x23, x14, #2 + sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x20, x20, x20 // |f1|<<=1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + add x17, x17, x22 + sub x20, x20, x27 + + cbnz x28, |$Loop_30| + + mov x27, #0x7FFFFFFF + ubfx x16, x17, #0, #32 + ubfx x17, x17, #32, #32 + ubfx x19, x20, #0, #32 + ubfx x20, x20, #32, #32 + sub x16, x16, x27 // remove the bias + sub x17, x17, x27 + sub x19, x19, x27 + sub x20, x20, x27 + + ret + ENDP + + ALIGN 16 +|__inner_loop_48| PROC +|$Loop_48| + sbfx x24, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x3, x9 + sub x15, x15, #1 + and x21, x9, x24 + sub x22, x9, x3 // |b_|-|a_| + subs x23, x3, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 + cselhs x9,x9,x3 + cselhs x3,x23,x22 + cselhs x2,x2,x25 + add x23, x9, #2 + lsr x3, x3, #1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + + cbnz x15, |$Loop_48| + + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/ct_is_square_mod_384-x86_64.asm b/crypto/blst_src/build/win64/ct_is_square_mod_384-x86_64.asm new file mode 100644 index 00000000000..38de6fc1229 --- /dev/null +++ b/crypto/blst_src/build/win64/ct_is_square_mod_384-x86_64.asm @@ -0,0 +1,509 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC ct_is_square_mod_384 + +ALIGN 32 +ct_is_square_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_ct_is_square_mod_384:: + mov rdi,rcx 
+ mov rsi,rdx + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,536 + +$L$SEH_body_ct_is_square_mod_384:: + + + lea rax,QWORD PTR[((24+255))+rsp] + and rax,-256 + + mov r8,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + mov r10,QWORD PTR[16+rdi] + mov r11,QWORD PTR[24+rdi] + mov r12,QWORD PTR[32+rdi] + mov r13,QWORD PTR[40+rdi] + + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rbx,QWORD PTR[16+rsi] + mov rcx,QWORD PTR[24+rsi] + mov rdx,QWORD PTR[32+rsi] + mov rdi,QWORD PTR[40+rsi] + mov rsi,rax + + mov QWORD PTR[rax],r8 + mov QWORD PTR[8+rax],r9 + mov QWORD PTR[16+rax],r10 + mov QWORD PTR[24+rax],r11 + mov QWORD PTR[32+rax],r12 + mov QWORD PTR[40+rax],r13 + + mov QWORD PTR[48+rax],r14 + mov QWORD PTR[56+rax],r15 + mov QWORD PTR[64+rax],rbx + mov QWORD PTR[72+rax],rcx + mov QWORD PTR[80+rax],rdx + mov QWORD PTR[88+rax],rdi + + xor rbp,rbp + mov ecx,24 + jmp $L$oop_is_square + +ALIGN 32 +$L$oop_is_square:: + mov DWORD PTR[16+rsp],ecx + + call __ab_approximation_30 + mov QWORD PTR[rsp],rax + mov QWORD PTR[8+rsp],rbx + + mov rdi,128+8*6 + xor rdi,rsi + call __smulq_384_n_shift_by_30 + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rdi,QWORD PTR[((-48))+rdi] + call __smulq_384_n_shift_by_30 + + mov ecx,DWORD PTR[16+rsp] + xor rsi,128 + + and r14,QWORD PTR[48+rdi] + shr r14,1 + add rbp,r14 + + sub ecx,1 + jnz $L$oop_is_square + + + + + mov r9,QWORD PTR[48+rsi] + call __inner_loop_48 + + mov rax,1 + and rax,rbp + xor rax,1 + + lea r8,QWORD PTR[536+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_ct_is_square_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_ct_is_square_mod_384:: +ct_is_square_mod_384 ENDP + + +ALIGN 32 +__smulq_384_n_shift_by_30 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbx,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbx,rdx + add rbx,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mov r14,rdx + and r14,rbx + mul rbx + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbx + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbx + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbx + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rbx + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + neg r14 + mul rbx + add r13,rax + adc r14,rdx + lea rsi,QWORD PTR[48+rsi] + mov rdx,rcx + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbx,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbx,rdx + add rbx,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mov r15,rdx + and r15,rbx + mul rbx + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbx + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbx + 
add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbx + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rbx + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + neg r15 + mul rbx + add r13,rax + adc r15,rdx + lea rsi,QWORD PTR[((-48))+rsi] + + add r8,QWORD PTR[rdi] + adc r9,QWORD PTR[8+rdi] + adc r10,QWORD PTR[16+rdi] + adc r11,QWORD PTR[24+rdi] + adc r12,QWORD PTR[32+rdi] + adc r13,QWORD PTR[40+rdi] + adc r14,r15 + + shrd r8,r9,30 + shrd r9,r10,30 + shrd r10,r11,30 + shrd r11,r12,30 + shrd r12,r13,30 + shrd r13,r14,30 + + sar r14,63 + xor rbx,rbx + sub rbx,r14 + + xor r8,r14 + xor r9,r14 + xor r10,r14 + xor r11,r14 + xor r12,r14 + xor r13,r14 + add r8,rbx + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__smulq_384_n_shift_by_30 ENDP + +ALIGN 32 +__ab_approximation_30 PROC PRIVATE + DB 243,15,30,250 + mov rbx,QWORD PTR[88+rsi] + mov r15,QWORD PTR[80+rsi] + mov r14,QWORD PTR[72+rsi] + + mov rax,r13 + or rax,rbx + cmovz r13,r12 + cmovz rbx,r15 + cmovz r12,r11 + mov r11,QWORD PTR[64+rsi] + cmovz r15,r14 + + mov rax,r13 + or rax,rbx + cmovz r13,r12 + cmovz rbx,r15 + cmovz r12,r10 + mov r10,QWORD PTR[56+rsi] + cmovz r15,r11 + + mov rax,r13 + or rax,rbx + cmovz r13,r12 + cmovz rbx,r15 + cmovz r12,r9 + mov r9,QWORD PTR[48+rsi] + cmovz r15,r10 + + mov rax,r13 + or rax,rbx + cmovz r13,r12 + cmovz rbx,r15 + cmovz r12,r8 + cmovz r15,r9 + + mov rax,r13 + or rax,rbx + bsr rcx,rax + lea rcx,QWORD PTR[1+rcx] + cmovz r13,r8 + cmovz rbx,r9 + cmovz rcx,rax + neg rcx + + + shld r13,r12,cl + shld rbx,r15,cl + + mov rax,0FFFFFFFF00000000h + mov r8d,r8d + mov r9d,r9d + and r13,rax + and rbx,rax + or r8,r13 + or r9,rbx + + jmp __inner_loop_30 + + DB 0F3h,0C3h ;repret +__ab_approximation_30 ENDP + +ALIGN 32 +__inner_loop_30 PROC PRIVATE + DB 243,15,30,250 + mov rbx,07FFFFFFF80000000h + mov rcx,0800000007FFFFFFFh + lea r15,QWORD PTR[((-1))+rbx] + mov edi,30 + +$L$oop_30:: + mov rax,r8 + and rax,r9 + shr rax,1 + + cmp r8,r9 + mov r10,r8 + mov r11,r9 + lea rax,QWORD PTR[rbp*1+rax] + mov r12,rbx + mov r13,rcx + mov r14,rbp + cmovb r8,r9 + cmovb r9,r10 + cmovb rbx,rcx + cmovb rcx,r12 + cmovb rbp,rax + + sub r8,r9 + sub rbx,rcx + add rbx,r15 + + test r10,1 + cmovz r8,r10 + cmovz r9,r11 + cmovz rbx,r12 + cmovz rcx,r13 + cmovz rbp,r14 + + lea rax,QWORD PTR[2+r9] + shr r8,1 + shr rax,2 + add rcx,rcx + lea rbp,QWORD PTR[rbp*1+rax] + sub rcx,r15 + + sub edi,1 + jnz $L$oop_30 + + shr r15,32 + mov eax,ebx + shr rbx,32 + mov edx,ecx + shr rcx,32 + sub rax,r15 + sub rbx,r15 + sub rdx,r15 + sub rcx,r15 + + DB 0F3h,0C3h ;repret +__inner_loop_30 ENDP + + +ALIGN 32 +__inner_loop_48 PROC PRIVATE + DB 243,15,30,250 + mov edi,48 + +$L$oop_48:: + mov rax,r8 + and rax,r9 + shr rax,1 + + cmp r8,r9 + mov r10,r8 + mov r11,r9 + lea rax,QWORD PTR[rbp*1+rax] + mov r12,rbp + cmovb r8,r9 + cmovb r9,r10 + cmovb rbp,rax + + sub r8,r9 + + test r10,1 + cmovz r8,r10 + cmovz r9,r11 + cmovz rbp,r12 + + lea rax,QWORD PTR[2+r9] + shr r8,1 + shr rax,2 + add rbp,rax + + sub edi,1 + jnz $L$oop_48 + + DB 0F3h,0C3h ;repret +__inner_loop_48 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_ct_is_square_mod_384 + DD imagerel $L$SEH_body_ct_is_square_mod_384 + DD imagerel $L$SEH_info_ct_is_square_mod_384_prologue + + DD imagerel $L$SEH_body_ct_is_square_mod_384 + DD imagerel 
$L$SEH_epilogue_ct_is_square_mod_384 + DD imagerel $L$SEH_info_ct_is_square_mod_384_body + + DD imagerel $L$SEH_epilogue_ct_is_square_mod_384 + DD imagerel $L$SEH_end_ct_is_square_mod_384 + DD imagerel $L$SEH_info_ct_is_square_mod_384_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_ct_is_square_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_ct_is_square_mod_384_body:: +DB 1,0,18,0 +DB 000h,0f4h,043h,000h +DB 000h,0e4h,044h,000h +DB 000h,0d4h,045h,000h +DB 000h,0c4h,046h,000h +DB 000h,034h,047h,000h +DB 000h,054h,048h,000h +DB 000h,074h,04ah,000h +DB 000h,064h,04bh,000h +DB 000h,001h,049h,000h +$L$SEH_info_ct_is_square_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/ctq_inverse_mod_384-x86_64.asm b/crypto/blst_src/build/win64/ctq_inverse_mod_384-x86_64.asm new file mode 100644 index 00000000000..de79f8ec80e --- /dev/null +++ b/crypto/blst_src/build/win64/ctq_inverse_mod_384-x86_64.asm @@ -0,0 +1,1224 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC ct_inverse_mod_383 + +ALIGN 32 +ct_inverse_mod_383 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_ct_inverse_mod_383:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,1112 + +$L$SEH_body_ct_inverse_mod_383:: + + + lea rax,QWORD PTR[((88+511))+rsp] + and rax,-512 + mov QWORD PTR[32+rsp],rdi + mov QWORD PTR[40+rsp],rcx + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,QWORD PTR[rdx] + mov r15,QWORD PTR[8+rdx] + mov rbx,QWORD PTR[16+rdx] + mov rbp,QWORD PTR[24+rdx] + mov rsi,QWORD PTR[32+rdx] + mov rdi,QWORD PTR[40+rdx] + + mov QWORD PTR[rax],r8 + mov QWORD PTR[8+rax],r9 + mov QWORD PTR[16+rax],r10 + mov QWORD PTR[24+rax],r11 + mov QWORD PTR[32+rax],r12 + mov QWORD PTR[40+rax],r13 + + mov QWORD PTR[48+rax],r14 + mov QWORD PTR[56+rax],r15 + mov QWORD PTR[64+rax],rbx + mov QWORD PTR[72+rax],rbp + mov QWORD PTR[80+rax],rsi + mov rsi,rax + mov QWORD PTR[88+rax],rdi + + + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + + + mov QWORD PTR[96+rdi],rdx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + + + mov QWORD PTR[96+rdi],rdx + + + xor rsi,256 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + + + + mov rax,QWORD PTR[96+rsi] + mov r11,QWORD PTR[144+rsi] + mov rbx,rdx + mov r10,rax + imul QWORD PTR[56+rsp] + mov r8,rax + mov rax,r11 + mov r9,rdx + imul QWORD PTR[64+rsp] + add r8,rax + adc r9,rdx + mov QWORD PTR[48+rdi],r8 + mov QWORD PTR[56+rdi],r9 + sar r9,63 + mov QWORD PTR[64+rdi],r9 + mov QWORD PTR[72+rdi],r9 + mov QWORD PTR[80+rdi],r9 + mov QWORD PTR[88+rdi],r9 + lea rsi,QWORD PTR[96+rsi] + + mov rax,r10 + imul rbx + mov r8,rax + mov rax,r11 + mov r9,rdx + imul rcx + 
add r8,rax + adc r9,rdx + mov QWORD PTR[96+rdi],r8 + mov QWORD PTR[104+rdi],r9 + sar r9,63 + mov QWORD PTR[112+rdi],r9 + mov QWORD PTR[120+rdi],r9 + mov QWORD PTR[128+rdi],r9 + mov QWORD PTR[136+rdi],r9 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + sar r13,63 + mov QWORD PTR[48+rdi],r13 + mov QWORD PTR[56+rdi],r13 + mov QWORD PTR[64+rdi],r13 + mov QWORD PTR[72+rdi],r13 + mov QWORD PTR[80+rdi],r13 + mov QWORD PTR[88+rdi],r13 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + 
call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_767x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_767x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_767x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_767x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_767x63 + + xor rsi,256+8*12 + mov edi,62 + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[48+rsi] + mov r11,QWORD PTR[56+rsi] + call __inner_loop_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + mov QWORD PTR[rdi],r8 + mov QWORD PTR[48+rdi],r10 + + + + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[96+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_767x63 + + + xor rsi,256+8*12 + mov edi,22 + + mov r8,QWORD PTR[rsi] + xor r9,r9 + mov r10,QWORD PTR[48+rsi] + xor r11,r11 + call __inner_loop_62 + + + + + + + + lea rsi,QWORD PTR[96+rsi] + + + + + + mov rdx,r12 + mov rcx,r13 + mov rdi,QWORD PTR[32+rsp] + call __smulq_767x63 + + mov rsi,QWORD 
PTR[40+rsp] + mov rdx,rax + sar rax,63 + + mov r8,rax + mov r9,rax + mov r10,rax + and r8,QWORD PTR[rsi] + and r9,QWORD PTR[8+rsi] + mov r11,rax + and r10,QWORD PTR[16+rsi] + and r11,QWORD PTR[24+rsi] + mov r12,rax + and r12,QWORD PTR[32+rsi] + and rax,QWORD PTR[40+rsi] + + add r14,r8 + adc r15,r9 + adc rbx,r10 + adc rbp,r11 + adc rcx,r12 + adc rdx,rax + + mov QWORD PTR[48+rdi],r14 + mov QWORD PTR[56+rdi],r15 + mov QWORD PTR[64+rdi],rbx + mov QWORD PTR[72+rdi],rbp + mov QWORD PTR[80+rdi],rcx + mov QWORD PTR[88+rdi],rdx + + lea r8,QWORD PTR[1112+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_ct_inverse_mod_383:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_ct_inverse_mod_383:: +ct_inverse_mod_383 ENDP + +ALIGN 32 +__smulq_767x63 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbp,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + mov QWORD PTR[8+rsp],rdi + mov QWORD PTR[16+rsp],rsi + lea rsi,QWORD PTR[48+rsi] + + xor rbp,rdx + add rbp,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mul rbp + mov QWORD PTR[rdi],rax + mov rax,r9 + mov r9,rdx + mul rbp + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mov QWORD PTR[8+rdi],r9 + mul rbp + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mov QWORD PTR[16+rdi],r10 + mul rbp + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mov QWORD PTR[24+rdi],r11 + mul rbp + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + mov QWORD PTR[32+rdi],r12 + imul rbp + add r13,rax + adc rdx,0 + + mov QWORD PTR[40+rdi],r13 + mov QWORD PTR[48+rdi],rdx + sar rdx,63 + mov QWORD PTR[56+rdi],rdx + mov rdx,rcx + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + mov r15,QWORD PTR[56+rsi] + mov rbx,QWORD PTR[64+rsi] + mov rbp,QWORD PTR[72+rsi] + mov rcx,QWORD PTR[80+rsi] + mov rdi,QWORD PTR[88+rsi] + + mov rsi,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rsi,rdx + add rsi,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + xor r14,rdx + xor r15,rdx + xor rbx,rdx + xor rbp,rdx + xor rcx,rdx + xor rdi,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + adc rbx,0 + adc rbp,0 + adc rcx,0 + adc rdi,0 + + mul rsi + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rsi + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rsi + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rsi + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rsi + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + mul rsi + add r13,rax + mov rax,r14 + adc rdx,0 + mov r14,rdx + mul rsi + add r14,rax + mov rax,r15 + adc rdx,0 + mov r15,rdx + mul rsi + add r15,rax + mov rax,rbx + adc rdx,0 + mov rbx,rdx + mul rsi + add rbx,rax + mov rax,rbp + adc rdx,0 + mov rbp,rdx + mul rsi + add rbp,rax + mov rax,rcx + adc rdx,0 + mov rcx,rdx + mul rsi + add rcx,rax + mov rax,rdi + adc rdx,0 + mov rdi,rdx + mov rdx,QWORD PTR[8+rsp] + imul rax,rsi + mov 
rsi,QWORD PTR[16+rsp] + add rax,rdi + + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + adc r10,QWORD PTR[16+rdx] + adc r11,QWORD PTR[24+rdx] + adc r12,QWORD PTR[32+rdx] + adc r13,QWORD PTR[40+rdx] + adc r14,QWORD PTR[48+rdx] + mov rdi,QWORD PTR[56+rdx] + adc r15,rdi + adc rbx,rdi + adc rbp,rdi + adc rcx,rdi + adc rax,rdi + + mov rdi,rdx + + mov QWORD PTR[rdx],r8 + mov QWORD PTR[8+rdx],r9 + mov QWORD PTR[16+rdx],r10 + mov QWORD PTR[24+rdx],r11 + mov QWORD PTR[32+rdx],r12 + mov QWORD PTR[40+rdx],r13 + mov QWORD PTR[48+rdx],r14 + mov QWORD PTR[56+rdx],r15 + mov QWORD PTR[64+rdx],rbx + mov QWORD PTR[72+rdx],rbp + mov QWORD PTR[80+rdx],rcx + mov QWORD PTR[88+rdx],rax + + DB 0F3h,0C3h ;repret +__smulq_767x63 ENDP + +ALIGN 32 +__smulq_383x63 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbp,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbp,rdx + add rbp,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mul rbp + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbp + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbp + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbp + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rbp + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + imul rax,rbp + add r13,rax + + lea rsi,QWORD PTR[48+rsi] + mov rdx,rcx + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbp,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbp,rdx + add rbp,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mul rbp + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbp + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbp + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbp + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rbp + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + imul rax,rbp + add r13,rax + + lea rsi,QWORD PTR[((-48))+rsi] + + add r8,QWORD PTR[rdi] + adc r9,QWORD PTR[8+rdi] + adc r10,QWORD PTR[16+rdi] + adc r11,QWORD PTR[24+rdi] + adc r12,QWORD PTR[32+rdi] + adc r13,QWORD PTR[40+rdi] + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__smulq_383x63 ENDP + +ALIGN 32 +__smulq_383_n_shift_by_62 PROC PRIVATE + DB 243,15,30,250 + mov rbx,rdx + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbp,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbp,rdx + add rbp,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mul rbp + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbp + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbp + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbp 
+ add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rbp + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + imul rbp + add r13,rax + adc rdx,0 + + lea rsi,QWORD PTR[48+rsi] + mov r14,rdx + mov rdx,rcx + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbp,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbp,rdx + add rbp,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mul rbp + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbp + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbp + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbp + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rbp + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + imul rbp + add r13,rax + adc rdx,0 + + lea rsi,QWORD PTR[((-48))+rsi] + + add r8,QWORD PTR[rdi] + adc r9,QWORD PTR[8+rdi] + adc r10,QWORD PTR[16+rdi] + adc r11,QWORD PTR[24+rdi] + adc r12,QWORD PTR[32+rdi] + adc r13,QWORD PTR[40+rdi] + adc r14,rdx + mov rdx,rbx + + shrd r8,r9,62 + shrd r9,r10,62 + shrd r10,r11,62 + shrd r11,r12,62 + shrd r12,r13,62 + shrd r13,r14,62 + + sar r14,63 + xor rbp,rbp + sub rbp,r14 + + xor r8,r14 + xor r9,r14 + xor r10,r14 + xor r11,r14 + xor r12,r14 + xor r13,r14 + add r8,rbp + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + xor rdx,r14 + xor rcx,r14 + add rdx,rbp + add rcx,rbp + + DB 0F3h,0C3h ;repret +__smulq_383_n_shift_by_62 ENDP + +ALIGN 32 +__ab_approximation_62 PROC PRIVATE + DB 243,15,30,250 + mov r9,QWORD PTR[40+rsi] + mov r11,QWORD PTR[88+rsi] + mov rbx,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[80+rsi] + mov r8,QWORD PTR[24+rsi] + mov r10,QWORD PTR[72+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + cmovz rbp,r10 + mov r8,QWORD PTR[16+rsi] + mov r10,QWORD PTR[64+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + cmovz rbp,r10 + mov r8,QWORD PTR[8+rsi] + mov r10,QWORD PTR[56+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + cmovz rbp,r10 + mov r8,QWORD PTR[rsi] + mov r10,QWORD PTR[48+rsi] + + mov rax,r9 + or rax,r11 + bsr rcx,rax + lea rcx,QWORD PTR[1+rcx] + cmovz r9,rbx + cmovz r11,rbp + cmovz rcx,rax + neg rcx + + + shld r9,rbx,cl + shld r11,rbp,cl + + jmp __inner_loop_62 + + DB 0F3h,0C3h ;repret +__ab_approximation_62 ENDP + +ALIGN 8 + DD 0 +__inner_loop_62 PROC PRIVATE + DB 243,15,30,250 + mov rdx,1 + xor rcx,rcx + xor r12,r12 + mov r13,1 + mov QWORD PTR[8+rsp],rsi + +$L$oop_62:: + xor rax,rax + xor rbx,rbx + test r8,1 + mov rbp,r10 + mov r14,r11 + cmovnz rax,r10 + cmovnz rbx,r11 + sub rbp,r8 + sbb r14,r9 + mov r15,r8 + mov rsi,r9 + sub r8,rax + sbb r9,rbx + cmovc r8,rbp + cmovc r9,r14 + cmovc r10,r15 + cmovc r11,rsi + mov rax,rdx + cmovc rdx,r12 + cmovc r12,rax + mov rbx,rcx + cmovc rcx,r13 + cmovc r13,rbx + xor rax,rax + xor rbx,rbx + shrd r8,r9,1 + shr r9,1 + test r15,1 + cmovnz rax,r12 + cmovnz rbx,r13 + add r12,r12 + add r13,r13 + sub rdx,rax + sub rcx,rbx + sub edi,1 + jnz $L$oop_62 + + mov rsi,QWORD PTR[8+rsp] + 
DB 0F3h,0C3h ;repret +__inner_loop_62 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_ct_inverse_mod_383 + DD imagerel $L$SEH_body_ct_inverse_mod_383 + DD imagerel $L$SEH_info_ct_inverse_mod_383_prologue + + DD imagerel $L$SEH_body_ct_inverse_mod_383 + DD imagerel $L$SEH_epilogue_ct_inverse_mod_383 + DD imagerel $L$SEH_info_ct_inverse_mod_383_body + + DD imagerel $L$SEH_epilogue_ct_inverse_mod_383 + DD imagerel $L$SEH_end_ct_inverse_mod_383 + DD imagerel $L$SEH_info_ct_inverse_mod_383_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_ct_inverse_mod_383_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_ct_inverse_mod_383_body:: +DB 1,0,18,0 +DB 000h,0f4h,08bh,000h +DB 000h,0e4h,08ch,000h +DB 000h,0d4h,08dh,000h +DB 000h,0c4h,08eh,000h +DB 000h,034h,08fh,000h +DB 000h,054h,090h,000h +DB 000h,074h,092h,000h +DB 000h,064h,093h,000h +DB 000h,001h,091h,000h +$L$SEH_info_ct_inverse_mod_383_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/ctx_inverse_mod_384-x86_64.asm b/crypto/blst_src/build/win64/ctx_inverse_mod_384-x86_64.asm new file mode 100644 index 00000000000..df4c46a4c44 --- /dev/null +++ b/crypto/blst_src/build/win64/ctx_inverse_mod_384-x86_64.asm @@ -0,0 +1,1597 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC ctx_inverse_mod_383 + +ALIGN 32 +ctx_inverse_mod_383 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_ctx_inverse_mod_383:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,1112 + +$L$SEH_body_ctx_inverse_mod_383:: + + + lea rax,QWORD PTR[((88+511))+rsp] + and rax,-512 + mov QWORD PTR[32+rsp],rdi + mov QWORD PTR[40+rsp],rcx + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,QWORD PTR[rdx] + mov r15,QWORD PTR[8+rdx] + mov rbx,QWORD PTR[16+rdx] + mov rbp,QWORD PTR[24+rdx] + mov rsi,QWORD PTR[32+rdx] + mov rdi,QWORD PTR[40+rdx] + + mov QWORD PTR[rax],r8 + mov QWORD PTR[8+rax],r9 + mov QWORD PTR[16+rax],r10 + mov QWORD PTR[24+rax],r11 + mov QWORD PTR[32+rax],r12 + mov QWORD PTR[40+rax],r13 + + mov QWORD PTR[48+rax],r14 + mov QWORD PTR[56+rax],r15 + mov QWORD PTR[64+rax],rbx + mov QWORD PTR[72+rax],rbp + mov QWORD PTR[80+rax],rsi + mov rsi,rax + mov QWORD PTR[88+rax],rdi + + + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + + + mov QWORD PTR[96+rdi],rdx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + + + mov QWORD PTR[96+rdi],rdx + + + xor rsi,256 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + + + + mov rax,QWORD PTR[96+rsi] + mov r11,QWORD PTR[144+rsi] + mov rbx,rdx + mov r10,rax + imul QWORD PTR[56+rsp] + mov r8,rax + mov rax,r11 + mov r9,rdx + imul QWORD PTR[64+rsp] + add r8,rax + 
adc r9,rdx + mov QWORD PTR[48+rdi],r8 + mov QWORD PTR[56+rdi],r9 + sar r9,63 + mov QWORD PTR[64+rdi],r9 + mov QWORD PTR[72+rdi],r9 + mov QWORD PTR[80+rdi],r9 + mov QWORD PTR[88+rdi],r9 + lea rsi,QWORD PTR[96+rsi] + + mov rax,r10 + imul rbx + mov r8,rax + mov rax,r11 + mov r9,rdx + imul rcx + add r8,rax + adc r9,rdx + mov QWORD PTR[96+rdi],r8 + mov QWORD PTR[104+rdi],r9 + sar r9,63 + mov QWORD PTR[112+rdi],r9 + mov QWORD PTR[120+rdi],r9 + mov QWORD PTR[128+rdi],r9 + mov QWORD PTR[136+rdi],r9 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov 
rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + sar r13,63 + mov 
QWORD PTR[48+rdi],r13 + mov QWORD PTR[56+rdi],r13 + mov QWORD PTR[64+rdi],r13 + mov QWORD PTR[72+rdi],r13 + mov QWORD PTR[80+rdi],r13 + mov QWORD PTR[88+rdi],r13 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + 
mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_191_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_191_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_191_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_191_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_191_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_191_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_191_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call 
__smulx_191_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + + xor rsi,256+8*12 + mov edi,53 + + mov r8,QWORD PTR[rsi] + + mov r10,QWORD PTR[48+rsi] + + call __inner_loop_62 + + + + + + + + lea rsi,QWORD PTR[96+rsi] + + + + + + mov rdx,r12 + mov rcx,r13 + mov rdi,QWORD PTR[32+rsp] + call __smulx_767x63 + + mov rsi,QWORD PTR[40+rsp] + mov rdx,rax + sar rax,63 + + mov r8,rax + mov r9,rax + mov r10,rax + and r8,QWORD PTR[rsi] + and r9,QWORD PTR[8+rsi] + mov r11,rax + and r10,QWORD PTR[16+rsi] + and r11,QWORD PTR[24+rsi] + mov r12,rax + and r12,QWORD PTR[32+rsi] + and rax,QWORD PTR[40+rsi] + + add r14,r8 + adc r15,r9 + adc rbx,r10 + adc rbp,r11 + adc rcx,r12 + adc rdx,rax + + mov QWORD PTR[48+rdi],r14 + mov QWORD PTR[56+rdi],r15 + mov QWORD PTR[64+rdi],rbx + mov QWORD PTR[72+rdi],rbp + mov QWORD PTR[80+rdi],rcx + mov QWORD PTR[88+rdi],rdx + + lea r8,QWORD PTR[1112+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_ctx_inverse_mod_383:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_ctx_inverse_mod_383:: +ctx_inverse_mod_383 ENDP + +ALIGN 32 +__smulx_767x63 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rax,rdx + sar rax,63 + xor rbp,rbp + sub rbp,rax + + mov QWORD PTR[8+rsp],rdi + mov QWORD PTR[16+rsp],rsi + lea rsi,QWORD PTR[48+rsi] + + xor rdx,rax + add rdx,rbp + + xor r8,rax + xor r9,rax + xor r10,rax + xor r11,rax + xor r12,rax + xor rax,r13 + add r8,rbp + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc rax,0 + + mulx rbp,r8,r8 + mulx r13,r9,r9 + add r9,rbp + mulx rbp,r10,r10 + adc r10,r13 + mulx r13,r11,r11 + adc r11,rbp + mulx rbp,r12,r12 + adc r12,r13 + adc rbp,0 + imul rdx + add rax,rbp + adc rdx,0 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],rax + mov QWORD PTR[48+rdi],rdx + sar rdx,63 + mov QWORD PTR[56+rdi],rdx + mov rdx,rcx + mov rax,rcx + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + mov r15,QWORD PTR[56+rsi] + mov rbx,QWORD PTR[64+rsi] + mov rbp,QWORD PTR[72+rsi] + mov rcx,QWORD PTR[80+rsi] + mov rdi,QWORD PTR[88+rsi] + + sar rax,63 + xor rsi,rsi + sub rsi,rax + + xor rdx,rax + add rdx,rsi + + xor r8,rax + xor r9,rax + xor r10,rax + xor r11,rax + xor r12,rax + xor r13,rax + xor r14,rax + xor r15,rax + xor rbx,rax + xor rbp,rax + xor rcx,rax + xor rdi,rax + add r8,rsi + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + adc rbx,0 + adc rbp,0 + adc rcx,0 + adc rdi,0 + + mulx rax,r8,r8 + mulx rsi,r9,r9 + add r9,rax + mulx rax,r10,r10 + adc r10,rsi + mulx rsi,r11,r11 + adc r11,rax + mulx rax,r12,r12 + adc r12,rsi + mulx rsi,r13,r13 + adc r13,rax + mulx rax,r14,r14 + adc r14,rsi + mulx rsi,r15,r15 + adc r15,rax + mulx rax,rbx,rbx + adc rbx,rsi + mulx 
rsi,rbp,rbp + adc rbp,rax + mulx rax,rcx,rcx + adc rcx,rsi + mulx rsi,rdi,rdi + mov rdx,QWORD PTR[8+rsp] + mov rsi,QWORD PTR[16+rsp] + adc rax,rdi + + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + adc r10,QWORD PTR[16+rdx] + adc r11,QWORD PTR[24+rdx] + adc r12,QWORD PTR[32+rdx] + adc r13,QWORD PTR[40+rdx] + adc r14,QWORD PTR[48+rdx] + mov rdi,QWORD PTR[56+rdx] + adc r15,rdi + adc rbx,rdi + adc rbp,rdi + adc rcx,rdi + adc rax,rdi + + mov rdi,rdx + + mov QWORD PTR[rdx],r8 + mov QWORD PTR[8+rdx],r9 + mov QWORD PTR[16+rdx],r10 + mov QWORD PTR[24+rdx],r11 + mov QWORD PTR[32+rdx],r12 + mov QWORD PTR[40+rdx],r13 + mov QWORD PTR[48+rdx],r14 + mov QWORD PTR[56+rdx],r15 + mov QWORD PTR[64+rdx],rbx + mov QWORD PTR[72+rdx],rbp + mov QWORD PTR[80+rdx],rcx + mov QWORD PTR[88+rdx],rax + + DB 0F3h,0C3h ;repret +__smulx_767x63 ENDP + +ALIGN 32 +__smulx_383x63 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[((0+0))+rsi] + mov r9,QWORD PTR[((0+8))+rsi] + mov r10,QWORD PTR[((0+16))+rsi] + mov r11,QWORD PTR[((0+24))+rsi] + mov r12,QWORD PTR[((0+32))+rsi] + mov r13,QWORD PTR[((0+40))+rsi] + + mov rbp,rdx + sar rbp,63 + xor rax,rax + sub rax,rbp + + xor rdx,rbp + add rdx,rax + + xor r8,rbp + xor r9,rbp + xor r10,rbp + xor r11,rbp + xor r12,rbp + xor r13,rbp + add r8,rax + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mulx rbp,r8,r8 + mulx rax,r9,r9 + add r9,rbp + mulx rbp,r10,r10 + adc r10,rax + mulx rax,r11,r11 + adc r11,rbp + mulx rbp,r12,r12 + adc r12,rax + mulx rax,r13,r13 + mov rdx,rcx + adc r13,rbp + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + mov r8,QWORD PTR[((48+0))+rsi] + mov r9,QWORD PTR[((48+8))+rsi] + mov r10,QWORD PTR[((48+16))+rsi] + mov r11,QWORD PTR[((48+24))+rsi] + mov r12,QWORD PTR[((48+32))+rsi] + mov r13,QWORD PTR[((48+40))+rsi] + + mov rbp,rdx + sar rbp,63 + xor rax,rax + sub rax,rbp + + xor rdx,rbp + add rdx,rax + + xor r8,rbp + xor r9,rbp + xor r10,rbp + xor r11,rbp + xor r12,rbp + xor r13,rbp + add r8,rax + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mulx rbp,r8,r8 + mulx rax,r9,r9 + add r9,rbp + mulx rbp,r10,r10 + adc r10,rax + mulx rax,r11,r11 + adc r11,rbp + mulx rbp,r12,r12 + adc r12,rax + mulx rax,r13,r13 + adc r13,rbp + + add r8,QWORD PTR[rdi] + adc r9,QWORD PTR[8+rdi] + adc r10,QWORD PTR[16+rdi] + adc r11,QWORD PTR[24+rdi] + adc r12,QWORD PTR[32+rdi] + adc r13,QWORD PTR[40+rdi] + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__smulx_383x63 ENDP + +ALIGN 32 +__smulx_383_n_shift_by_31 PROC PRIVATE + DB 243,15,30,250 + mov rbx,rdx + xor r14,r14 + mov r8,QWORD PTR[((0+0))+rsi] + mov r9,QWORD PTR[((0+8))+rsi] + mov r10,QWORD PTR[((0+16))+rsi] + mov r11,QWORD PTR[((0+24))+rsi] + mov r12,QWORD PTR[((0+32))+rsi] + mov r13,QWORD PTR[((0+40))+rsi] + + mov rax,rdx + sar rax,63 + xor rbp,rbp + sub rbp,rax + + xor rdx,rax + add rdx,rbp + + xor r8,rax + xor r9,rax + xor r10,rax + xor r11,rax + xor r12,rax + xor rax,r13 + add r8,rbp + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc rax,0 + + mulx rbp,r8,r8 + mulx r13,r9,r9 + add r9,rbp + mulx rbp,r10,r10 + adc r10,r13 + mulx r13,r11,r11 + adc r11,rbp + mulx rbp,r12,r12 + adc r12,r13 + adc rbp,0 + imul rdx + add rax,rbp + adc r14,rdx + + mov rdx,rcx + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD 
PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],rax + mov r8,QWORD PTR[((48+0))+rsi] + mov r9,QWORD PTR[((48+8))+rsi] + mov r10,QWORD PTR[((48+16))+rsi] + mov r11,QWORD PTR[((48+24))+rsi] + mov r12,QWORD PTR[((48+32))+rsi] + mov r13,QWORD PTR[((48+40))+rsi] + + mov rax,rdx + sar rax,63 + xor rbp,rbp + sub rbp,rax + + xor rdx,rax + add rdx,rbp + + xor r8,rax + xor r9,rax + xor r10,rax + xor r11,rax + xor r12,rax + xor rax,r13 + add r8,rbp + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc rax,0 + + mulx rbp,r8,r8 + mulx r13,r9,r9 + add r9,rbp + mulx rbp,r10,r10 + adc r10,r13 + mulx r13,r11,r11 + adc r11,rbp + mulx rbp,r12,r12 + adc r12,r13 + adc rbp,0 + imul rdx + add rax,rbp + adc rdx,0 + + add r8,QWORD PTR[rdi] + adc r9,QWORD PTR[8+rdi] + adc r10,QWORD PTR[16+rdi] + adc r11,QWORD PTR[24+rdi] + adc r12,QWORD PTR[32+rdi] + adc rax,QWORD PTR[40+rdi] + adc r14,rdx + mov rdx,rbx + + shrd r8,r9,31 + shrd r9,r10,31 + shrd r10,r11,31 + shrd r11,r12,31 + shrd r12,rax,31 + shrd rax,r14,31 + + sar r14,63 + xor rbp,rbp + sub rbp,r14 + + xor r8,r14 + xor r9,r14 + xor r10,r14 + xor r11,r14 + xor r12,r14 + xor rax,r14 + add r8,rbp + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc rax,0 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],rax + + xor rdx,r14 + xor rcx,r14 + add rdx,rbp + add rcx,rbp + + DB 0F3h,0C3h ;repret +__smulx_383_n_shift_by_31 ENDP + +ALIGN 32 +__smulx_191_n_shift_by_31 PROC PRIVATE + DB 243,15,30,250 + mov rbx,rdx + mov r8,QWORD PTR[((0+0))+rsi] + mov r9,QWORD PTR[((0+8))+rsi] + mov r10,QWORD PTR[((0+16))+rsi] + + mov rax,rdx + sar rax,63 + xor rbp,rbp + sub rbp,rax + + xor rdx,rax + add rdx,rbp + + xor r8,rax + xor r9,rax + xor rax,r10 + add r8,rbp + adc r9,0 + adc rax,0 + + mulx rbp,r8,r8 + mulx r10,r9,r9 + add r9,rbp + adc r10,0 + imul rdx + add r10,rax + adc rdx,0 + mov r14,rdx + mov rdx,rcx + mov r11,QWORD PTR[((48+0))+rsi] + mov r12,QWORD PTR[((48+8))+rsi] + mov r13,QWORD PTR[((48+16))+rsi] + + mov rax,rdx + sar rax,63 + xor rbp,rbp + sub rbp,rax + + xor rdx,rax + add rdx,rbp + + xor r11,rax + xor r12,rax + xor rax,r13 + add r11,rbp + adc r12,0 + adc rax,0 + + mulx rbp,r11,r11 + mulx r13,r12,r12 + add r12,rbp + adc r13,0 + imul rdx + add r13,rax + adc rdx,0 + add r11,r8 + adc r12,r9 + adc r13,r10 + adc r14,rdx + mov rdx,rbx + + shrd r11,r12,31 + shrd r12,r13,31 + shrd r13,r14,31 + + sar r14,63 + xor rbp,rbp + sub rbp,r14 + + xor r11,r14 + xor r12,r14 + xor r13,r14 + add r11,rbp + adc r12,0 + adc r13,0 + + mov QWORD PTR[rdi],r11 + mov QWORD PTR[8+rdi],r12 + mov QWORD PTR[16+rdi],r13 + + xor rdx,r14 + xor rcx,r14 + add rdx,rbp + add rcx,rbp + + DB 0F3h,0C3h ;repret +__smulx_191_n_shift_by_31 ENDP + +ALIGN 32 +__ab_approximation_31 PROC PRIVATE + DB 243,15,30,250 + mov r9,QWORD PTR[40+rsi] + mov r11,QWORD PTR[88+rsi] + mov rbx,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[80+rsi] + mov r8,QWORD PTR[24+rsi] + mov r10,QWORD PTR[72+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + mov r8,QWORD PTR[16+rsi] + cmovz rbp,r10 + mov r10,QWORD PTR[64+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + mov r8,QWORD PTR[8+rsi] + cmovz rbp,r10 + mov r10,QWORD PTR[56+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + mov r8,QWORD PTR[rsi] + cmovz rbp,r10 + mov r10,QWORD PTR[48+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + cmovz rbp,r10 + + mov 
rax,r9 + or rax,r11 + bsr rcx,rax + lea rcx,QWORD PTR[1+rcx] + cmovz r9,r8 + cmovz r11,r10 + cmovz rcx,rax + neg rcx + + + shld r9,rbx,cl + shld r11,rbp,cl + + mov eax,07FFFFFFFh + and r8,rax + and r10,rax + andn r9,rax,r9 + andn r11,rax,r11 + or r8,r9 + or r10,r11 + + jmp __inner_loop_31 + + DB 0F3h,0C3h ;repret +__ab_approximation_31 ENDP + +ALIGN 32 +__inner_loop_31 PROC PRIVATE + DB 243,15,30,250 + mov rcx,07FFFFFFF80000000h + mov r13,0800000007FFFFFFFh + mov r15,07FFFFFFF7FFFFFFFh + +$L$oop_31:: + cmp r8,r10 + mov rax,r8 + mov rbx,r10 + mov rbp,rcx + mov r14,r13 + cmovb r8,r10 + cmovb r10,rax + cmovb rcx,r13 + cmovb r13,rbp + + sub r8,r10 + sub rcx,r13 + add rcx,r15 + + test rax,1 + cmovz r8,rax + cmovz r10,rbx + cmovz rcx,rbp + cmovz r13,r14 + + shr r8,1 + add r13,r13 + sub r13,r15 + sub edi,1 + jnz $L$oop_31 + + shr r15,32 + mov edx,ecx + mov r12d,r13d + shr rcx,32 + shr r13,32 + sub rdx,r15 + sub rcx,r15 + sub r12,r15 + sub r13,r15 + + DB 0F3h,0C3h ;repret +__inner_loop_31 ENDP + + +ALIGN 32 +__inner_loop_62 PROC PRIVATE + DB 243,15,30,250 + mov rdx,1 + xor rcx,rcx + xor r12,r12 + mov r13,1 + +$L$oop_62:: + xor rax,rax + test r8,1 + mov rbx,r10 + cmovnz rax,r10 + sub rbx,r8 + mov rbp,r8 + sub r8,rax + cmovc r8,rbx + cmovc r10,rbp + mov rax,rdx + cmovc rdx,r12 + cmovc r12,rax + mov rbx,rcx + cmovc rcx,r13 + cmovc r13,rbx + xor rax,rax + xor rbx,rbx + shr r8,1 + test rbp,1 + cmovnz rax,r12 + cmovnz rbx,r13 + add r12,r12 + add r13,r13 + sub rdx,rax + sub rcx,rbx + sub edi,1 + jnz $L$oop_62 + + DB 0F3h,0C3h ;repret +__inner_loop_62 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_ctx_inverse_mod_383 + DD imagerel $L$SEH_body_ctx_inverse_mod_383 + DD imagerel $L$SEH_info_ctx_inverse_mod_383_prologue + + DD imagerel $L$SEH_body_ctx_inverse_mod_383 + DD imagerel $L$SEH_epilogue_ctx_inverse_mod_383 + DD imagerel $L$SEH_info_ctx_inverse_mod_383_body + + DD imagerel $L$SEH_epilogue_ctx_inverse_mod_383 + DD imagerel $L$SEH_end_ctx_inverse_mod_383 + DD imagerel $L$SEH_info_ctx_inverse_mod_383_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_ctx_inverse_mod_383_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_ctx_inverse_mod_383_body:: +DB 1,0,18,0 +DB 000h,0f4h,08bh,000h +DB 000h,0e4h,08ch,000h +DB 000h,0d4h,08dh,000h +DB 000h,0c4h,08eh,000h +DB 000h,034h,08fh,000h +DB 000h,054h,090h,000h +DB 000h,074h,092h,000h +DB 000h,064h,093h,000h +DB 000h,001h,091h,000h +$L$SEH_info_ctx_inverse_mod_383_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/div3w-armv8.asm b/crypto/blst_src/build/win64/div3w-armv8.asm new file mode 100644 index 00000000000..7114ccf0c2e --- /dev/null +++ b/crypto/blst_src/build/win64/div3w-armv8.asm @@ -0,0 +1,89 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + EXPORT |div_3_limbs|[FUNC] + ALIGN 32 +|div_3_limbs| PROC + ldp x4,x5,[x0] // load R + eor x0,x0,x0 // Q = 0 + mov x3,#64 // loop counter + nop + +|$Loop| + subs x6,x4,x1 // R - D + add x0,x0,x0 // Q <<= 1 + sbcs x7,x5,x2 + add x0,x0,#1 // Q + speculative bit + csello x4,x4,x6 + extr x1,x2,x1,#1 // D >>= 1 + csello x5,x5,x7 + lsr x2,x2,#1 + sbc x0,x0,xzr // subtract speculative bit + sub x3,x3,#1 + cbnz x3,|$Loop| + + asr x3,x0,#63 // top bit -> mask + add x0,x0,x0 // Q <<= 1 + subs x6,x4,x1 // R - D + add x0,x0,#1 // Q + specilative bit + sbcs x7,x5,x2 + sbc x0,x0,xzr // subtract speculative bit + + orr x0,x0,x3 
// all ones if overflow + + ret + ENDP + + EXPORT |quot_rem_128|[FUNC] + ALIGN 32 +|quot_rem_128| PROC + ldp x3,x4,[x1] + + mul x5,x3,x2 // divisor[0:1} * quotient + umulh x6,x3,x2 + mul x11, x4,x2 + umulh x7,x4,x2 + + ldp x8,x9,[x0] // load 3 limbs of the dividend + ldr x10,[x0,#16] + + adds x6,x6,x11 + adc x7,x7,xzr + + subs x8,x8,x5 // dividend - divisor * quotient + sbcs x9,x9,x6 + sbcs x10,x10,x7 + sbc x5,xzr,xzr // borrow -> mask + + add x2,x2,x5 // if borrowed, adjust the quotient ... + and x3,x3,x5 + and x4,x4,x5 + adds x8,x8,x3 // ... and add divisor + adc x9,x9,x4 + + stp x8,x9,[x0] // save 2 limbs of the remainder + str x2,[x0,#16] // and one limb of the quotient + + mov x0,x2 // return adjusted quotient + + ret + ENDP + + + EXPORT |quot_rem_64|[FUNC] + ALIGN 32 +|quot_rem_64| PROC + ldr x3,[x1] + ldr x8,[x0] // load 1 limb of the dividend + + mul x5,x3,x2 // divisor * quotient + + sub x8,x8,x5 // dividend - divisor * quotient + + stp x8,x2,[x0] // save remainder and quotient + + mov x0,x2 // return quotient + + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/div3w-x86_64.asm b/crypto/blst_src/build/win64/div3w-x86_64.asm new file mode 100644 index 00000000000..c35f426f3d2 --- /dev/null +++ b/crypto/blst_src/build/win64/div3w-x86_64.asm @@ -0,0 +1,152 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC div_3_limbs + + +ALIGN 32 +div_3_limbs PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_div_3_limbs:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + mov r8,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + xor rax,rax + mov ecx,64 + +$L$oop:: + mov r10,r8 + sub r8,rsi + mov r11,r9 + sbb r9,rdx + lea rax,QWORD PTR[1+rax*1+rax] + mov rdi,rdx + cmovc r8,r10 + cmovc r9,r11 + sbb rax,0 + shl rdi,63 + shr rsi,1 + shr rdx,1 + or rsi,rdi + sub ecx,1 + jnz $L$oop + + lea rcx,QWORD PTR[1+rax*1+rax] + sar rax,63 + + sub r8,rsi + sbb r9,rdx + sbb rcx,0 + + or rax,rcx + + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_div_3_limbs:: +div_3_limbs ENDP +PUBLIC quot_rem_128 + + +ALIGN 32 +quot_rem_128 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_quot_rem_128:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + mov rax,rdx + mov rcx,rdx + + mul QWORD PTR[rsi] + mov r8,rax + mov rax,rcx + mov r9,rdx + + mul QWORD PTR[8+rsi] + add r9,rax + adc rdx,0 + + mov r10,QWORD PTR[rdi] + mov r11,QWORD PTR[8+rdi] + mov rax,QWORD PTR[16+rdi] + + sub r10,r8 + sbb r11,r9 + sbb rax,rdx + sbb r8,r8 + + add rcx,r8 + mov r9,r8 + and r8,QWORD PTR[rsi] + and r9,QWORD PTR[8+rsi] + add r10,r8 + adc r11,r9 + + mov QWORD PTR[rdi],r10 + mov QWORD PTR[8+rdi],r11 + mov QWORD PTR[16+rdi],rcx + + mov rax,rcx + + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_quot_rem_128:: +quot_rem_128 ENDP + + + + + +PUBLIC quot_rem_64 + + +ALIGN 32 +quot_rem_64 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_quot_rem_64:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + mov rax,rdx + imul rdx,QWORD PTR[rsi] + + mov r10,QWORD PTR[rdi] + + sub r10,rdx + + mov QWORD PTR[rdi],r10 + mov QWORD PTR[8+rdi],rax + + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_quot_rem_64:: +quot_rem_64 ENDP +.text$ ENDS +.pdata SEGMENT READONLY 
ALIGN(4) +ALIGN 4 +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/dll.c b/crypto/blst_src/build/win64/dll.c new file mode 100644 index 00000000000..a70d0c98a23 --- /dev/null +++ b/crypto/blst_src/build/win64/dll.c @@ -0,0 +1,32 @@ +#include + +#if defined(_MSC_VER) +/* + * Even though we don't have memcpy/memset anywhere, MSVC compiler + * generates calls to them as it recognizes corresponding patterns. + */ +void *memcpy(unsigned char *dst, const unsigned char *src, size_t n) +{ + void *ret = dst; + + while(n--) + *dst++ = *src++; + + return ret; +} + +void *memset(unsigned char *dst, int c, size_t n) +{ + void *ret = dst; + + while(n--) + *dst++ = (unsigned char)c; + + return ret; +} +#elif defined(__GNUC__) +# pragma GCC diagnostic ignored "-Wunused-parameter" +#endif + +BOOL WINAPI DllMain(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved) +{ return TRUE; } diff --git a/crypto/blst_src/build/win64/mul_mont_256-armv8.asm b/crypto/blst_src/build/win64/mul_mont_256-armv8.asm new file mode 100644 index 00000000000..bb2dfe043c7 --- /dev/null +++ b/crypto/blst_src/build/win64/mul_mont_256-armv8.asm @@ -0,0 +1,465 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + + EXPORT |mul_mont_sparse_256|[FUNC] + ALIGN 32 +|mul_mont_sparse_256| PROC + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x10,x11,[x1] + ldr x9, [x2] + ldp x12,x13,[x1,#16] + + mul x19,x10,x9 + ldp x5,x6,[x3] + mul x20,x11,x9 + ldp x7,x8,[x3,#16] + mul x21,x12,x9 + mul x22,x13,x9 + + umulh x14,x10,x9 + umulh x15,x11,x9 + mul x3,x4,x19 + umulh x16,x12,x9 + umulh x17,x13,x9 + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,xzr, x17 + mul x17,x8,x3 + ldr x9,[x2,8*1] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*2] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*3] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs 
x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + adcs x20,x21,x15 + adcs x21,x22,x16 + adcs x22,x23,x17 + adc x23,xzr,xzr + + subs x14,x19,x5 + sbcs x15,x20,x6 + sbcs x16,x21,x7 + sbcs x17,x22,x8 + sbcs xzr, x23,xzr + + csello x19,x19,x14 + csello x20,x20,x15 + csello x21,x21,x16 + csello x22,x22,x17 + + stp x19,x20,[x0] + stp x21,x22,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret + ENDP + + + EXPORT |sqr_mont_sparse_256|[FUNC] + ALIGN 32 +|sqr_mont_sparse_256| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + mov x4,x3 + + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x11,x6,x5 // a[1]*a[0] + umulh x15,x6,x5 + mul x12,x7,x5 // a[2]*a[0] + umulh x16,x7,x5 + mul x13,x8,x5 // a[3]*a[0] + umulh x19,x8,x5 + + adds x12,x12,x15 // accumulate high parts of multiplication + mul x14,x7,x6 // a[2]*a[1] + umulh x15,x7,x6 + adcs x13,x13,x16 + mul x16,x8,x6 // a[3]*a[1] + umulh x17,x8,x6 + adc x19,x19,xzr // can't overflow + + mul x20,x8,x7 // a[3]*a[2] + umulh x21,x8,x7 + + adds x15,x15,x16 // accumulate high parts of multiplication + mul x10,x5,x5 // a[0]*a[0] + adc x16,x17,xzr // can't overflow + + adds x13,x13,x14 // accumulate low parts of multiplication + umulh x5,x5,x5 + adcs x19,x19,x15 + mul x15,x6,x6 // a[1]*a[1] + adcs x20,x20,x16 + umulh x6,x6,x6 + adc x21,x21,xzr // can't overflow + + adds x11,x11,x11 // acc[1-6]*=2 + mul x16,x7,x7 // a[2]*a[2] + adcs x12,x12,x12 + umulh x7,x7,x7 + adcs x13,x13,x13 + mul x17,x8,x8 // a[3]*a[3] + adcs x19,x19,x19 + umulh x8,x8,x8 + adcs x20,x20,x20 + adcs x21,x21,x21 + adc x22,xzr,xzr + + adds x11,x11,x5 // +a[i]*a[i] + adcs x12,x12,x15 + adcs x13,x13,x6 + adcs x19,x19,x16 + adcs x20,x20,x7 + adcs x21,x21,x17 + adc x22,x22,x8 + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + adds x10,x10,x19 // accumulate upper half + adcs x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adc x19,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x19,xzr + + csello x10,x10,x14 + csello x11,x11,x15 + csello x12,x12,x16 + csello x13,x13,x17 + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + EXPORT |from_mont_256|[FUNC] + ALIGN 32 +|from_mont_256| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + + csello x10,x10,x14 + csello x11,x11,x15 + csello x12,x12,x16 + csello x13,x13,x17 + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |redc_mont_256|[FUNC] + ALIGN 32 +|redc_mont_256| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + ldp x14,x15,[x1,#32] + ldp x16,x17,[x1,#48] + + adds x10,x10,x14 + adcs x11,x11,x15 + adcs x12,x12,x16 + adcs x13,x13,x17 + adc x9,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x9,xzr + + csello x10,x10,x14 + csello x11,x11,x15 + csello x12,x12,x16 + csello x13,x13,x17 + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__mul_by_1_mont_256| PROC + mul x3,x4,x10 + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + adc x13,x9,x17 + + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/mul_mont_384-armv8.asm b/crypto/blst_src/build/win64/mul_mont_384-armv8.asm new file mode 100644 index 00000000000..a309dfa4121 --- /dev/null +++ b/crypto/blst_src/build/win64/mul_mont_384-armv8.asm @@ -0,0 +1,2373 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + EXPORT |add_mod_384x384|[FUNC] + ALIGN 32 +|add_mod_384x384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __add_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__add_mod_384x384| PROC + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + stp x11, x12, [x0] + adcs x15,x15,x23 + ldp x11, x12, [x1,#48] + adcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + adcs x11,x11,x19 + stp x15, x16, [x0,#32] + adcs x12,x12,x20 + ldp x15, x16, [x1,#80] + adcs x13,x13,x21 + ldp x23,x24,[x2,#80] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csello x11,x11,x19 + csello x12,x12,x20 + csello x13,x13,x21 + csello x14,x14,x22 + stp x11,x12,[x0,#48] + csello x15,x15,x23 + stp x13,x14,[x0,#64] + csello x16,x16,x24 + stp x15,x16,[x0,#80] + + ret + ENDP + + + EXPORT |sub_mod_384x384|[FUNC] + ALIGN 32 +|sub_mod_384x384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__sub_mod_384x384| PROC + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + stp x11, x12, [x0] + sbcs x15,x15,x23 + ldp x11, x12, [x1,#48] + sbcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + sbcs x11,x11,x19 + stp x15, x16, [x0,#32] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#80] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#80] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + ret + ENDP + + + ALIGN 32 +|__add_mod_384| PROC + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csello x11,x11,x19 + csello x12,x12,x20 + csello x13,x13,x21 + csello x14,x14,x22 + csello x15,x15,x23 + stp x11,x12,[x0] + csello x16,x16,x24 + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + ENDP + + + ALIGN 32 +|__sub_mod_384| PROC + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs 
x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0] + adc x16,x16,x24 + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + ENDP + + + + EXPORT |mul_mont_384x|[FUNC] + ALIGN 32 +|mul_mont_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#288 // space for 3 768-bit vectors + + mov x26,x0 // save r_ptr + mov x27,x1 // save b_ptr + mov x28,x2 // save b_ptr + + sub x0,sp,#0 // mul_384(t0, a->re, b->re) + bl __mul_384 + + add x1,x1,#48 // mul_384(t1, a->im, b->im) + add x2,x2,#48 + add x0,sp,#96 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + sub x2,x1,#48 + add x0,sp,#240 + bl __add_mod_384 + + add x1,x28,#0 + add x2,x28,#48 + add x0,sp,#192 // t2 + bl __add_mod_384 + + add x1,x0,#0 + add x2,x0,#48 + bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,x0 + add x2,sp,#0 + bl __sub_mod_384x384 + + add x2,sp,#96 + bl __sub_mod_384x384 // t2 = t2-t0-t1 + + add x1,sp,#0 + add x2,sp,#96 + add x0,sp,#0 + bl __sub_mod_384x384 // t0 = t0-t1 + + add x1,sp,#0 // ret->re = redc(t0) + add x0,x26,#0 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + + add x1,sp,#192 // ret->im = redc(t2) + add x0,x0,#48 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#288 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |sqr_mont_384x|[FUNC] + ALIGN 32 +|sqr_mont_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 2 384-bit vectors + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + add x0,sp,#0 + bl __add_mod_384 // t0 = a->re + a->im + + add x0,sp,#48 + bl __sub_mod_384 // t1 = a->re - a->im + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) + + adds x11,x11,x11 // add with itself + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csello x19,x11,x19 + csello x20,x12,x20 + csello x21,x13,x21 + ldp x11,x12,[sp] + csello x22,x14,x22 + ldr x17, [sp,#48] + csello x23,x15,x23 + ldp x13,x14,[sp,#16] + csello x24,x16,x24 + ldp x15,x16,[sp,#32] + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + add x2,sp,#48 + bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |mul_mont_384|[FUNC] + ALIGN 32 +|mul_mont_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__mul_mont_384| PROC + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + mov x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*1] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs 
x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*2] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*3] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*4] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh 
x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*5] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + adc x17,x17,xzr + + adds x19,x20,x26 + adcs x20,x21,x27 + adcs x21,x22,x28 + adcs x22,x23,x0 + adcs x23,x24,x1 + adcs x24,x25,x3 + adc x25,x17,xzr + + subs x26,x19,x5 + sbcs x27,x20,x6 + sbcs x28,x21,x7 + sbcs x0,x22,x8 + sbcs x1,x23,x9 + sbcs x3,x24,x10 + sbcs xzr, x25,xzr + + csello x11,x19,x26 + csello x12,x20,x27 + csello x13,x21,x28 + csello x14,x22,x0 + csello x15,x23,x1 + csello x16,x24,x3 + ret + ENDP + + + + EXPORT |sqr_mont_384|[FUNC] + ALIGN 32 +|sqr_mont_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for 768-bit vector + mov x4,x3 // adjust for missing b_ptr + + mov x3,x0 // save r_ptr + mov x0,sp + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + mov x1,sp + mov x0,x3 // restore r_ptr + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |sqr_n_mul_mont_383|[FUNC] + ALIGN 32 +|sqr_n_mul_mont_383| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 768-bit vector + mov x17,x5 // save b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + mov x0,sp +|$Loop_sqr_383| + bl __sqr_384 + sub x2,x2,#1 // counter + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,sp + bl __mul_by_1_mont_384 + + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // just accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + cbnz x2,|$Loop_sqr_383| + + mov x2,x17 + ldr x17,[x17] + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + ALIGN 32 +|__sqr_384| PROC + mul x19,x12,x11 + mul x20,x13,x11 + mul x21,x14,x11 + mul x22,x15,x11 + mul x23,x16,x11 + + umulh x6,x12,x11 + umulh x7,x13,x11 + umulh x8,x14,x11 + umulh x9,x15,x11 + adds x20,x20,x6 + umulh x10,x16,x11 + adcs x21,x21,x7 + mul x7,x13,x12 + adcs x22,x22,x8 + mul x8,x14,x12 + adcs x23,x23,x9 + mul x9,x15,x12 + adc x24,xzr, x10 + mul x10,x16,x12 + + adds x21,x21,x7 + umulh x7,x13,x12 + adcs x22,x22,x8 + umulh x8,x14,x12 + adcs x23,x23,x9 + umulh x9,x15,x12 + adcs x24,x24,x10 + umulh x10,x16,x12 + adc x25,xzr,xzr + + mul x5,x11,x11 + adds x22,x22,x7 + umulh x11, x11,x11 + adcs x23,x23,x8 + mul x8,x14,x13 + adcs x24,x24,x9 + mul x9,x15,x13 + adc x25,x25,x10 + mul x10,x16,x13 + + adds x23,x23,x8 + umulh x8,x14,x13 + adcs x24,x24,x9 + umulh x9,x15,x13 + adcs x25,x25,x10 + umulh x10,x16,x13 + adc x26,xzr,xzr + + mul x6,x12,x12 + adds x24,x24,x8 + umulh x12, x12,x12 + adcs x25,x25,x9 + mul x9,x15,x14 + adc x26,x26,x10 + mul x10,x16,x14 + + adds x25,x25,x9 + umulh x9,x15,x14 + adcs x26,x26,x10 + umulh x10,x16,x14 + adc x27,xzr,xzr + mul x7,x13,x13 + adds x26,x26,x9 + umulh x13, x13,x13 + adc x27,x27,x10 + mul x8,x14,x14 + + mul x10,x16,x15 + umulh x14, x14,x14 + adds x27,x27,x10 + umulh x10,x16,x15 + mul x9,x15,x15 + adc x28,x10,xzr + + adds x19,x19,x19 + adcs x20,x20,x20 + adcs x21,x21,x21 + adcs x22,x22,x22 + adcs x23,x23,x23 + adcs x24,x24,x24 + adcs x25,x25,x25 + adcs x26,x26,x26 + umulh x15, x15,x15 + adcs x27,x27,x27 + mul x10,x16,x16 + adcs x28,x28,x28 + umulh x16, x16,x16 + adc x1,xzr,xzr + + adds x19,x19,x11 + adcs x20,x20,x6 + adcs x21,x21,x12 + adcs x22,x22,x7 + adcs x23,x23,x13 + adcs x24,x24,x8 + adcs x25,x25,x14 + stp x5,x19,[x0] + adcs x26,x26,x9 + stp x20,x21,[x0,#16] + adcs x27,x27,x15 + stp x22,x23,[x0,#32] + adcs x28,x28,x10 + stp x24,x25,[x0,#48] + adc x16,x16,x1 + stp x26,x27,[x0,#64] + stp x28,x16,[x0,#80] + + ret + ENDP + + + EXPORT |sqr_384|[FUNC] + ALIGN 32 +|sqr_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |redc_mont_384|[FUNC] + ALIGN 32 +|redc_mont_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |from_mont_384|[FUNC] + ALIGN 32 +|from_mont_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + + csello x11,x11,x19 + csello x12,x12,x20 + csello x13,x13,x21 + csello x14,x14,x22 + csello x15,x15,x23 + csello x16,x16,x24 + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__mul_by_1_mont_384| PROC + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + mul x26,x4,x11 + ldp x15,x16,[x1,#32] + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul 
x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + ret + ENDP + + + ALIGN 32 +|__redc_tail_mont_384| PROC + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csello x11,x11,x19 + csello x12,x12,x20 + csello x13,x13,x21 + csello x14,x14,x22 + csello x15,x15,x23 + csello x16,x16,x24 + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + ENDP + + + + EXPORT |mul_384|[FUNC] + ALIGN 32 +|mul_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + bl __mul_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__mul_384| PROC + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + + umulh x5,x11,x17 + umulh x6,x12,x17 + umulh x7,x13,x17 + umulh x8,x14,x17 + umulh x9,x15,x17 + umulh x10,x16,x17 + ldr x17,[x2,8*1] + + str x19,[x0] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,xzr, x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(1+1)] + adc x25,xzr,xzr + + str x19,[x0,8*1] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(2+1)] + adc x25,xzr,xzr + + str x19,[x0,8*2] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(3+1)] + adc x25,xzr,xzr + + str x19,[x0,8*3] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(4+1)] + adc x25,xzr,xzr + + str x19,[x0,8*4] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + adc x25,xzr,xzr + + str x19,[x0,8*5] + adds x19,x20,x5 + adcs x20,x21,x6 + adcs x21,x22,x7 + adcs x22,x23,x8 + adcs x23,x24,x9 + adc x24,x25,x10 + + stp x19,x20,[x0,#48] + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ret + ENDP + + + + EXPORT |mul_382x|[FUNC] + ALIGN 32 +|mul_382x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for two 384-bit vectors + + ldp x11,x12,[x1] + mov x26,x0 // save r_ptr + ldp x19,x20,[x1,#48] + mov x27,x1 // save a_ptr + ldp x13,x14,[x1,#16] + mov x28,x2 // save b_ptr + ldp x21,x22,[x1,#64] + ldp x15,x16,[x1,#32] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x23,x24,[x1,#80] + adcs x6,x12,x20 + ldp x11,x12,[x2] + adcs x7,x13,x21 + ldp x19,x20,[x2,#48] + adcs x8,x14,x22 + ldp x13,x14,[x2,#16] + adcs x9,x15,x23 + ldp x21,x22,[x2,#64] + adc x10,x16,x24 + ldp x15,x16,[x2,#32] + + stp x5,x6,[sp] + adds x5,x11,x19 // t1 = b->re + b->im + ldp x23,x24,[x2,#80] + adcs x6,x12,x20 + stp x7,x8,[sp,#16] + adcs x7,x13,x21 + adcs x8,x14,x22 + stp x9,x10,[sp,#32] + adcs x9,x15,x23 + stp x5,x6,[sp,#48] + adc x10,x16,x24 + stp x7,x8,[sp,#64] + stp x9,x10,[sp,#80] + + bl __mul_384 // mul_384(ret->re, a->re, b->re) + + add x1,sp,#0 // mul_384(ret->im, t0, t1) + add x2,sp,#48 + add x0,x26,#96 + bl __mul_384 + + add x1,x27,#48 // mul_384(tx, a->im, b->im) + add x2,x28,#48 + add x0,sp,#0 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + add x1,x26,#96 // ret->im -= tx + add x2,sp,#0 + add x0,x26,#96 + bl __sub_mod_384x384 + + add x2,x26,#0 // ret->im -= ret->re + bl __sub_mod_384x384 + + add x1,x26,#0 // ret->re -= tx + add x2,sp,#0 + add x0,x26,#0 + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |sqr_382x|[FUNC] + ALIGN 32 +|sqr_382x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x19,x20,[x1,#48] + ldp x13,x14,[x1,#16] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x21,x22,[x1,#64] + adcs x6,x12,x20 + ldp x15,x16,[x1,#32] + adcs x7,x13,x21 + ldp x23,x24,[x1,#80] + adcs x8,x14,x22 + stp x5,x6,[x0] + adcs x9,x15,x23 + ldp x5,x6,[x2] + adc x10,x16,x24 + stp x7,x8,[x0,#16] + + subs x11,x11,x19 // t1 = a->re - a->im + ldp x7,x8,[x2,#16] + sbcs x12,x12,x20 + stp x9,x10,[x0,#32] + sbcs x13,x13,x21 + ldp x9,x10,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + adds x11,x11,x19 + and x21,x7,x25 + adcs x12,x12,x20 + and x22,x8,x25 + adcs x13,x13,x21 + and x23,x9,x25 + adcs x14,x14,x22 + and x24,x10,x25 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + mov x4,x1 // save a_ptr + add x1,x0,#0 // mul_384(ret->re, t0, t1) + add x2,x0,#48 + bl __mul_384 + + add x1,x4,#0 // mul_384(ret->im, a->re, a->im) + add x2,x4,#48 + add x0,x0,#96 + bl __mul_384 + ldr x30,[x29,#8] + + ldp x11,x12,[x0] + ldp x13,x14,[x0,#16] + adds x11,x11,x11 // add with itself + ldp x15,x16,[x0,#32] + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adcs x19,x19,x19 + adcs x20,x20,x20 + stp x11,x12,[x0] + adcs x21,x21,x21 + stp x13,x14,[x0,#16] + adcs x22,x22,x22 + stp x15,x16,[x0,#32] + adcs x23,x23,x23 + stp x19,x20,[x0,#48] + adc x24,x24,x24 + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 
3573752767 + ret + ENDP + + + + EXPORT |sqr_mont_382x|[FUNC] + ALIGN 32 +|sqr_mont_382x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#112 // space for two 384-bit vectors + word + mov x4,x3 // adjust for missing b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x17,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x5,x11,x17 // t0 = a->re + a->im + adcs x6,x12,x20 + adcs x7,x13,x21 + adcs x8,x14,x22 + adcs x9,x15,x23 + adc x10,x16,x24 + + subs x19,x11,x17 // t1 = a->re - a->im + sbcs x20,x12,x20 + sbcs x21,x13,x21 + sbcs x22,x14,x22 + sbcs x23,x15,x23 + sbcs x24,x16,x24 + sbc x25,xzr,xzr // borrow flag as mask + + stp x5,x6,[sp] + stp x7,x8,[sp,#16] + stp x9,x10,[sp,#32] + stp x19,x20,[sp,#48] + stp x21,x22,[sp,#64] + stp x23,x24,[sp,#80] + str x25,[sp,#96] + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) + + adds x19,x11,x11 // add with itself + adcs x20,x12,x12 + adcs x21,x13,x13 + adcs x22,x14,x14 + adcs x23,x15,x15 + adc x24,x16,x16 + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + ldp x11,x12,[sp] + ldr x17,[sp,#48] + ldp x13,x14,[sp,#16] + ldp x15,x16,[sp,#32] + + add x2,sp,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) + ldr x30,[x29,#8] + + ldr x25,[sp,#96] // account for sign from a->re - a->im + ldp x19,x20,[sp] + ldp x21,x22,[sp,#16] + ldp x23,x24,[sp,#32] + + and x19,x19,x25 + and x20,x20,x25 + and x21,x21,x25 + and x22,x22,x25 + and x23,x23,x25 + and x24,x24,x25 + + subs x11,x11,x19 + sbcs x12,x12,x20 + sbcs x13,x13,x21 + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + and x21,x7,x25 + and x22,x8,x25 + and x23,x9,x25 + and x24,x10,x25 + + adds x11,x11,x19 + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#112 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__mul_mont_383_nonred| PROC + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + ldr x17,[x2,8*1] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs 
x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*2] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*3] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*4] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*5] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds 
x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + + adds x11,x20,x26 + adcs x12,x21,x27 + adcs x13,x22,x28 + adcs x14,x23,x0 + adcs x15,x24,x1 + adcs x16,x25,x3 + + ret + ENDP + + + + EXPORT |sgn0_pty_mont_384|[FUNC] + ALIGN 32 +|sgn0_pty_mont_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + adds x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |sgn0_pty_mont_384x|[FUNC] + ALIGN 32 +|sgn0_pty_mont_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + add x1,x1,#48 + + and x2,x11,#1 + orr x3,x11,x12 + adds x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + orr x3,x3,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x2,x2,x17 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + orr x1,x11,x12 + adds x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + orr x1,x1,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + cmp x3,#0 + cseleq x3,x0,x2 + + cmp x1,#0 + cselne x1,x0,x2 + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/mulq_mont_256-x86_64.asm b/crypto/blst_src/build/win64/mulq_mont_256-x86_64.asm new file mode 100644 index 00000000000..c3bf8634617 --- /dev/null +++ b/crypto/blst_src/build/win64/mulq_mont_256-x86_64.asm @@ -0,0 +1,884 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC mul_mont_sparse_256 + + +ALIGN 32 +mul_mont_sparse_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_mont_sparse_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_mul_mont_sparse_256:: + + + mov rax,QWORD PTR[rdx] + mov r13,QWORD PTR[rsi] + mov r14,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov rbp,QWORD PTR[24+rsi] + mov rbx,rdx + + mov r15,rax + mul r13 + mov r9,rax + mov rax,r15 + mov r10,rdx + call __mulq_mont_sparse_256 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_mont_sparse_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_mont_sparse_256:: +mul_mont_sparse_256 ENDP + +PUBLIC sqr_mont_sparse_256 + + +ALIGN 32 +sqr_mont_sparse_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_mont_sparse_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_sqr_mont_sparse_256:: + + + mov rax,QWORD PTR[rsi] + mov r8,rcx + mov r14,QWORD PTR[8+rsi] + mov rcx,rdx + mov r12,QWORD PTR[16+rsi] + lea rbx,QWORD PTR[rsi] + mov rbp,QWORD PTR[24+rsi] + + mov r15,rax + mul rax + mov r9,rax + mov rax,r15 + mov r10,rdx + call __mulq_mont_sparse_256 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD 
PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqr_mont_sparse_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_mont_sparse_256:: +sqr_mont_sparse_256 ENDP + +ALIGN 32 +__mulq_mont_sparse_256 PROC PRIVATE + DB 243,15,30,250 + mul r14 + add r10,rax + mov rax,r15 + adc rdx,0 + mov r11,rdx + + mul r12 + add r11,rax + mov rax,r15 + adc rdx,0 + mov r12,rdx + + mul rbp + add r12,rax + mov rax,QWORD PTR[8+rbx] + adc rdx,0 + xor r14,r14 + mov r13,rdx + + mov rdi,r9 + imul r9,r8 + + + mov r15,rax + mul QWORD PTR[rsi] + add r10,rax + mov rax,r15 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[8+rsi] + add r11,rax + mov rax,r15 + adc rdx,0 + add r11,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rsi] + add r12,rax + mov rax,r15 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rsi] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,rbp + adc r14,rdx + xor r15,r15 + + + mul QWORD PTR[rcx] + add rdi,rax + mov rax,r9 + adc rdi,rdx + + mul QWORD PTR[8+rcx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,rdi + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r12,rax + mov rax,QWORD PTR[16+rbx] + adc rdx,0 + add r12,rbp + adc rdx,0 + add r13,rdx + adc r14,0 + adc r15,0 + mov rdi,r10 + imul r10,r8 + + + mov r9,rax + mul QWORD PTR[rsi] + add r11,rax + mov rax,r9 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[8+rsi] + add r12,rax + mov rax,r9 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rsi] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rsi] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,rbp + adc r15,rdx + xor r9,r9 + + + mul QWORD PTR[rcx] + add rdi,rax + mov rax,r10 + adc rdi,rdx + + mul QWORD PTR[8+rcx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,rdi + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r13,rax + mov rax,QWORD PTR[24+rbx] + adc rdx,0 + add r13,rbp + adc rdx,0 + add r14,rdx + adc r15,0 + adc r9,0 + mov rdi,r11 + imul r11,r8 + + + mov r10,rax + mul QWORD PTR[rsi] + add r12,rax + mov rax,r10 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[8+rsi] + add r13,rax + mov rax,r10 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rsi] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rsi] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,rbp + adc r9,rdx + xor r10,r10 + + + mul QWORD PTR[rcx] + add rdi,rax + mov rax,r11 + adc rdi,rdx + + mul QWORD PTR[8+rcx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,rdi + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,rbp + adc rdx,0 + add r15,rdx + adc r9,0 + adc r10,0 + imul rax,r8 + mov rsi,QWORD PTR[8+rsp] + + + mov r11,rax + mul QWORD PTR[rcx] + add r12,rax + mov rax,r11 + adc r12,rdx + + mul QWORD PTR[8+rcx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r12 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r14,rax + mov rax,r11 + adc rdx,0 + add r14,rbp + adc rdx,0 + mov rbp,rdx + + mul 
QWORD PTR[24+rcx] + mov rbx,r14 + add r15,rbp + adc rdx,0 + add r15,rax + mov rax,r13 + adc rdx,0 + add r9,rdx + adc r10,0 + + + + + mov r12,r15 + sub r13,QWORD PTR[rcx] + sbb r14,QWORD PTR[8+rcx] + sbb r15,QWORD PTR[16+rcx] + mov rbp,r9 + sbb r9,QWORD PTR[24+rcx] + sbb r10,0 + + cmovc r13,rax + cmovc r14,rbx + cmovc r15,r12 + mov QWORD PTR[rsi],r13 + cmovc r9,rbp + mov QWORD PTR[8+rsi],r14 + mov QWORD PTR[16+rsi],r15 + mov QWORD PTR[24+rsi],r9 + + DB 0F3h,0C3h ;repret + +__mulq_mont_sparse_256 ENDP +PUBLIC from_mont_256 + + +ALIGN 32 +from_mont_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_from_mont_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_from_mont_256:: + + + mov rbx,rdx + call __mulq_by_1_mont_256 + + + + + + mov r10,r14 + mov r11,r15 + mov r12,r9 + + sub r13,QWORD PTR[rbx] + sbb r14,QWORD PTR[8+rbx] + sbb r15,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + + cmovnc rax,r13 + cmovnc r10,r14 + cmovnc r11,r15 + mov QWORD PTR[rdi],rax + cmovnc r12,r9 + mov QWORD PTR[8+rdi],r10 + mov QWORD PTR[16+rdi],r11 + mov QWORD PTR[24+rdi],r12 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_from_mont_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_from_mont_256:: +from_mont_256 ENDP + +PUBLIC redc_mont_256 + + +ALIGN 32 +redc_mont_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_redc_mont_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_redc_mont_256:: + + + mov rbx,rdx + call __mulq_by_1_mont_256 + + add r13,QWORD PTR[32+rsi] + adc r14,QWORD PTR[40+rsi] + mov rax,r13 + adc r15,QWORD PTR[48+rsi] + mov r10,r14 + adc r9,QWORD PTR[56+rsi] + sbb rsi,rsi + + + + + mov r11,r15 + sub r13,QWORD PTR[rbx] + sbb r14,QWORD PTR[8+rbx] + sbb r15,QWORD PTR[16+rbx] + mov r12,r9 + sbb r9,QWORD PTR[24+rbx] + sbb rsi,0 + + cmovnc rax,r13 + cmovnc r10,r14 + cmovnc r11,r15 + mov QWORD PTR[rdi],rax + cmovnc r12,r9 + mov QWORD PTR[8+rdi],r10 + mov QWORD PTR[16+rdi],r11 + mov QWORD PTR[24+rdi],r12 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_redc_mont_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_redc_mont_256:: +redc_mont_256 ENDP + +ALIGN 32 +__mulq_by_1_mont_256 PROC PRIVATE + DB 243,15,30,250 + mov rax,QWORD PTR[rsi] + mov r10,QWORD PTR[8+rsi] + mov r11,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] + + mov r13,rax + imul rax,rcx + mov r9,rax + + mul QWORD PTR[rbx] + add r13,rax + mov rax,r9 + adc r13,rdx + + mul QWORD PTR[8+rbx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,r13 + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[16+rbx] + mov r14,r10 + imul r10,rcx + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,r13 + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[24+rbx] + add r12,rax + mov rax,r10 + adc rdx,0 + add 
r12,r13 + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[rbx] + add r14,rax + mov rax,r10 + adc r14,rdx + + mul QWORD PTR[8+rbx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[16+rbx] + mov r15,r11 + imul r11,rcx + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[24+rbx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[rbx] + add r15,rax + mov rax,r11 + adc r15,rdx + + mul QWORD PTR[8+rbx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rbx] + mov r9,r12 + imul r12,rcx + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[24+rbx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[rbx] + add r9,rax + mov rax,r12 + adc r9,rdx + + mul QWORD PTR[8+rbx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rbx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rbx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r9 + adc rdx,0 + mov r9,rdx + DB 0F3h,0C3h ;repret +__mulq_by_1_mont_256 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_mul_mont_sparse_256 + DD imagerel $L$SEH_body_mul_mont_sparse_256 + DD imagerel $L$SEH_info_mul_mont_sparse_256_prologue + + DD imagerel $L$SEH_body_mul_mont_sparse_256 + DD imagerel $L$SEH_epilogue_mul_mont_sparse_256 + DD imagerel $L$SEH_info_mul_mont_sparse_256_body + + DD imagerel $L$SEH_epilogue_mul_mont_sparse_256 + DD imagerel $L$SEH_end_mul_mont_sparse_256 + DD imagerel $L$SEH_info_mul_mont_sparse_256_epilogue + + DD imagerel $L$SEH_begin_sqr_mont_sparse_256 + DD imagerel $L$SEH_body_sqr_mont_sparse_256 + DD imagerel $L$SEH_info_sqr_mont_sparse_256_prologue + + DD imagerel $L$SEH_body_sqr_mont_sparse_256 + DD imagerel $L$SEH_epilogue_sqr_mont_sparse_256 + DD imagerel $L$SEH_info_sqr_mont_sparse_256_body + + DD imagerel $L$SEH_epilogue_sqr_mont_sparse_256 + DD imagerel $L$SEH_end_sqr_mont_sparse_256 + DD imagerel $L$SEH_info_sqr_mont_sparse_256_epilogue + + DD imagerel $L$SEH_begin_from_mont_256 + DD imagerel $L$SEH_body_from_mont_256 + DD imagerel $L$SEH_info_from_mont_256_prologue + + DD imagerel $L$SEH_body_from_mont_256 + DD imagerel $L$SEH_epilogue_from_mont_256 + DD imagerel $L$SEH_info_from_mont_256_body + + DD imagerel $L$SEH_epilogue_from_mont_256 + DD imagerel $L$SEH_end_from_mont_256 + DD imagerel $L$SEH_info_from_mont_256_epilogue + + DD imagerel $L$SEH_begin_redc_mont_256 + DD imagerel $L$SEH_body_redc_mont_256 + DD imagerel $L$SEH_info_redc_mont_256_prologue + + DD imagerel $L$SEH_body_redc_mont_256 + DD imagerel $L$SEH_epilogue_redc_mont_256 + DD imagerel $L$SEH_info_redc_mont_256_body + + DD imagerel $L$SEH_epilogue_redc_mont_256 + DD imagerel $L$SEH_end_redc_mont_256 + DD imagerel $L$SEH_info_redc_mont_256_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_mul_mont_sparse_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_mont_sparse_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_mont_sparse_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 
000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_mont_sparse_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_mont_sparse_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sqr_mont_sparse_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_from_mont_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_from_mont_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_from_mont_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_redc_mont_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_redc_mont_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_redc_mont_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/mulq_mont_384-x86_64.asm b/crypto/blst_src/build/win64/mulq_mont_384-x86_64.asm new file mode 100644 index 00000000000..0ccb46786c3 --- /dev/null +++ b/crypto/blst_src/build/win64/mulq_mont_384-x86_64.asm @@ -0,0 +1,4233 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + + + + + + + + +ALIGN 32 +__sub_mod_384x384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + + sub r8,QWORD PTR[rdx] + mov r15,QWORD PTR[56+rsi] + sbb r9,QWORD PTR[8+rdx] + mov rax,QWORD PTR[64+rsi] + sbb r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[72+rsi] + sbb r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[80+rsi] + sbb r12,QWORD PTR[32+rdx] + mov rsi,QWORD PTR[88+rsi] + sbb r13,QWORD PTR[40+rdx] + mov QWORD PTR[rdi],r8 + sbb r14,QWORD PTR[48+rdx] + mov r8,QWORD PTR[rcx] + mov QWORD PTR[8+rdi],r9 + sbb r15,QWORD PTR[56+rdx] + mov r9,QWORD PTR[8+rcx] + mov QWORD PTR[16+rdi],r10 + sbb rax,QWORD PTR[64+rdx] + mov r10,QWORD PTR[16+rcx] + mov QWORD PTR[24+rdi],r11 + sbb rbx,QWORD PTR[72+rdx] + mov r11,QWORD PTR[24+rcx] + mov QWORD PTR[32+rdi],r12 + sbb rbp,QWORD PTR[80+rdx] + mov r12,QWORD PTR[32+rcx] + mov QWORD PTR[40+rdi],r13 + sbb rsi,QWORD PTR[88+rdx] + mov r13,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r8,rdx + and r9,rdx + and r10,rdx + and r11,rdx + and r12,rdx + and r13,rdx + + add r14,r8 + adc r15,r9 + mov QWORD PTR[48+rdi],r14 + adc rax,r10 + mov QWORD PTR[56+rdi],r15 + adc rbx,r11 + mov QWORD PTR[64+rdi],rax + adc rbp,r12 + mov QWORD PTR[72+rdi],rbx + adc rsi,r13 + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rsi + + DB 0F3h,0C3h ;repret +__sub_mod_384x384 ENDP + + +ALIGN 32 +__add_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD 
PTR[40+rsi] + + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + adc r10,QWORD PTR[16+rdx] + mov r14,r8 + adc r11,QWORD PTR[24+rdx] + mov r15,r9 + adc r12,QWORD PTR[32+rdx] + mov rax,r10 + adc r13,QWORD PTR[40+rdx] + mov rbx,r11 + sbb rdx,rdx + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + mov QWORD PTR[rdi],r8 + cmovc r11,rbx + mov QWORD PTR[8+rdi],r9 + cmovc r12,rbp + mov QWORD PTR[16+rdi],r10 + cmovc r13,rsi + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__add_mod_384 ENDP + + +ALIGN 32 +__sub_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +__sub_mod_384_a_is_loaded:: + sub r8,QWORD PTR[rdx] + mov r14,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rdx] + mov r15,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rdx] + mov rax,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rdx] + mov rbx,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rdx] + mov rbp,QWORD PTR[32+rcx] + sbb r13,QWORD PTR[40+rdx] + mov rsi,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r14,rdx + and r15,rdx + and rax,rdx + and rbx,rdx + and rbp,rdx + and rsi,rdx + + add r8,r14 + adc r9,r15 + mov QWORD PTR[rdi],r8 + adc r10,rax + mov QWORD PTR[8+rdi],r9 + adc r11,rbx + mov QWORD PTR[16+rdi],r10 + adc r12,rbp + mov QWORD PTR[24+rdi],r11 + adc r13,rsi + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__sub_mod_384 ENDP +PUBLIC mul_mont_384x + + +ALIGN 32 +mul_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_mont_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,328 + +$L$SEH_body_mul_mont_384x:: + + + mov rbx,rdx + mov QWORD PTR[32+rsp],rdi + mov QWORD PTR[24+rsp],rsi + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[8+rsp],rcx + mov QWORD PTR[rsp],r8 + + + + + lea rdi,QWORD PTR[40+rsp] + call __mulq_384 + + + lea rbx,QWORD PTR[48+rbx] + lea rsi,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[((40+96))+rsp] + call __mulq_384 + + + mov rcx,QWORD PTR[8+rsp] + lea rdx,QWORD PTR[((-48))+rsi] + lea rdi,QWORD PTR[((40+192+48))+rsp] + call __add_mod_384 + + mov rsi,QWORD PTR[16+rsp] + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[((-48))+rdi] + call __add_mod_384 + + lea rbx,QWORD PTR[rdi] + lea rsi,QWORD PTR[48+rdi] + call __mulq_384 + + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[40+rsp] + mov rcx,QWORD PTR[8+rsp] + call __sub_mod_384x384 + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[((-96))+rdi] + call __sub_mod_384x384 + + + lea rsi,QWORD PTR[40+rsp] + lea rdx,QWORD PTR[((40+96))+rsp] + lea rdi,QWORD PTR[40+rsp] + call __sub_mod_384x384 + + mov rbx,rcx + + + lea rsi,QWORD PTR[40+rsp] + mov rcx,QWORD PTR[rsp] + mov rdi,QWORD PTR[32+rsp] + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + + lea rsi,QWORD PTR[((40+192))+rsp] + mov rcx,QWORD PTR[rsp] + lea rdi,QWORD PTR[48+rdi] + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + lea r8,QWORD PTR[328+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov 
rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_mul_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_mont_384x:: +mul_mont_384x ENDP +PUBLIC sqr_mont_384x + + +ALIGN 32 +sqr_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_mont_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_sqr_mont_384x:: + + + mov QWORD PTR[rsp],rcx + mov rcx,rdx + mov QWORD PTR[8+rsp],rdi + mov QWORD PTR[16+rsp],rsi + + + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[32+rsp] + call __add_mod_384 + + + mov rsi,QWORD PTR[16+rsp] + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[((32+48))+rsp] + call __sub_mod_384 + + + mov rsi,QWORD PTR[16+rsp] + lea rbx,QWORD PTR[48+rsi] + + mov rax,QWORD PTR[48+rsi] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov r13,QWORD PTR[24+rsi] + + call __mulq_mont_384 + add r14,r14 + adc r15,r15 + adc r8,r8 + mov r12,r14 + adc r9,r9 + mov r13,r15 + adc r10,r10 + mov rax,r8 + adc r11,r11 + mov rbx,r9 + sbb rdx,rdx + + sub r14,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + mov rbp,r10 + sbb r8,QWORD PTR[16+rcx] + sbb r9,QWORD PTR[24+rcx] + sbb r10,QWORD PTR[32+rcx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r14,r12 + cmovc r15,r13 + cmovc r8,rax + mov QWORD PTR[48+rdi],r14 + cmovc r9,rbx + mov QWORD PTR[56+rdi],r15 + cmovc r10,rbp + mov QWORD PTR[64+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[72+rdi],r9 + mov QWORD PTR[80+rdi],r10 + mov QWORD PTR[88+rdi],r11 + + lea rsi,QWORD PTR[32+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + + mov rax,QWORD PTR[((32+48))+rsp] + mov r14,QWORD PTR[((32+0))+rsp] + mov r15,QWORD PTR[((32+8))+rsp] + mov r12,QWORD PTR[((32+16))+rsp] + mov r13,QWORD PTR[((32+24))+rsp] + + call __mulq_mont_384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_mont_384x:: +sqr_mont_384x ENDP + +PUBLIC mul_382x + + +ALIGN 32 +mul_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_382x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_mul_382x:: + + + lea rdi,QWORD PTR[96+rdi] + mov QWORD PTR[rsp],rsi + mov QWORD PTR[8+rsp],rdx + mov QWORD PTR[16+rsp],rdi + mov QWORD PTR[24+rsp],rcx + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + add r8,QWORD PTR[48+rsi] + adc r9,QWORD PTR[56+rsi] + adc r10,QWORD PTR[64+rsi] + adc r11,QWORD PTR[72+rsi] + adc r12,QWORD PTR[80+rsi] + adc r13,QWORD PTR[88+rsi] + + mov QWORD PTR[((32+0))+rsp],r8 + mov QWORD PTR[((32+8))+rsp],r9 + mov QWORD PTR[((32+16))+rsp],r10 + mov QWORD PTR[((32+24))+rsp],r11 + mov QWORD PTR[((32+32))+rsp],r12 + mov QWORD PTR[((32+40))+rsp],r13 + + + mov r8,QWORD PTR[rdx] + mov r9,QWORD PTR[8+rdx] + mov r10,QWORD PTR[16+rdx] + mov 
r11,QWORD PTR[24+rdx] + mov r12,QWORD PTR[32+rdx] + mov r13,QWORD PTR[40+rdx] + + add r8,QWORD PTR[48+rdx] + adc r9,QWORD PTR[56+rdx] + adc r10,QWORD PTR[64+rdx] + adc r11,QWORD PTR[72+rdx] + adc r12,QWORD PTR[80+rdx] + adc r13,QWORD PTR[88+rdx] + + mov QWORD PTR[((32+48))+rsp],r8 + mov QWORD PTR[((32+56))+rsp],r9 + mov QWORD PTR[((32+64))+rsp],r10 + mov QWORD PTR[((32+72))+rsp],r11 + mov QWORD PTR[((32+80))+rsp],r12 + mov QWORD PTR[((32+88))+rsp],r13 + + + lea rsi,QWORD PTR[((32+0))+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + call __mulq_384 + + + mov rsi,QWORD PTR[rsp] + mov rbx,QWORD PTR[8+rsp] + lea rdi,QWORD PTR[((-96))+rdi] + call __mulq_384 + + + lea rsi,QWORD PTR[48+rsi] + lea rbx,QWORD PTR[48+rbx] + lea rdi,QWORD PTR[32+rsp] + call __mulq_384 + + + mov rsi,QWORD PTR[16+rsp] + lea rdx,QWORD PTR[32+rsp] + mov rcx,QWORD PTR[24+rsp] + mov rdi,rsi + call __sub_mod_384x384 + + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[((-96))+rdi] + call __sub_mod_384x384 + + + lea rsi,QWORD PTR[((-96))+rdi] + lea rdx,QWORD PTR[32+rsp] + lea rdi,QWORD PTR[((-96))+rdi] + call __sub_mod_384x384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_mul_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_382x:: +mul_382x ENDP +PUBLIC sqr_382x + + +ALIGN 32 +sqr_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_382x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_sqr_382x:: + + + mov rcx,rdx + + + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov rbx,QWORD PTR[24+rsi] + mov rbp,QWORD PTR[32+rsi] + mov rdx,QWORD PTR[40+rsi] + + mov r8,r14 + add r14,QWORD PTR[48+rsi] + mov r9,r15 + adc r15,QWORD PTR[56+rsi] + mov r10,rax + adc rax,QWORD PTR[64+rsi] + mov r11,rbx + adc rbx,QWORD PTR[72+rsi] + mov r12,rbp + adc rbp,QWORD PTR[80+rsi] + mov r13,rdx + adc rdx,QWORD PTR[88+rsi] + + mov QWORD PTR[rdi],r14 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],rax + mov QWORD PTR[24+rdi],rbx + mov QWORD PTR[32+rdi],rbp + mov QWORD PTR[40+rdi],rdx + + + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[48+rdi] + call __sub_mod_384_a_is_loaded + + + lea rsi,QWORD PTR[rdi] + lea rbx,QWORD PTR[((-48))+rdi] + lea rdi,QWORD PTR[((-48))+rdi] + call __mulq_384 + + + mov rsi,QWORD PTR[rsp] + lea rbx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[96+rdi] + call __mulq_384 + + mov r8,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + mov r10,QWORD PTR[16+rdi] + mov r11,QWORD PTR[24+rdi] + mov r12,QWORD PTR[32+rdi] + mov r13,QWORD PTR[40+rdi] + mov r14,QWORD PTR[48+rdi] + mov r15,QWORD PTR[56+rdi] + mov rax,QWORD PTR[64+rdi] + mov rbx,QWORD PTR[72+rdi] + mov rbp,QWORD PTR[80+rdi] + add r8,r8 + mov rdx,QWORD PTR[88+rdi] + adc r9,r9 + mov QWORD PTR[rdi],r8 + adc r10,r10 + mov QWORD PTR[8+rdi],r9 + adc r11,r11 + mov QWORD PTR[16+rdi],r10 + adc r12,r12 + mov QWORD PTR[24+rdi],r11 + adc r13,r13 + mov QWORD PTR[32+rdi],r12 + adc r14,r14 + mov QWORD PTR[40+rdi],r13 + adc r15,r15 + mov QWORD PTR[48+rdi],r14 + adc rax,rax + mov QWORD PTR[56+rdi],r15 + adc rbx,rbx + mov QWORD PTR[64+rdi],rax + adc rbp,rbp + mov QWORD PTR[72+rdi],rbx + adc rdx,rdx + mov QWORD PTR[80+rdi],rbp + mov 
QWORD PTR[88+rdi],rdx + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqr_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_382x:: +sqr_382x ENDP +PUBLIC mul_384 + + +ALIGN 32 +mul_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + +$L$SEH_body_mul_384:: + + + mov rbx,rdx + call __mulq_384 + + mov r12,QWORD PTR[rsp] + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_mul_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_384:: +mul_384 ENDP + + +ALIGN 32 +__mulq_384 PROC PRIVATE + DB 243,15,30,250 + mov rax,QWORD PTR[rbx] + + mov rbp,rax + mul QWORD PTR[rsi] + mov QWORD PTR[rdi],rax + mov rax,rbp + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r11,rax + mov rax,QWORD PTR[8+rbx] + adc rdx,0 + mov r12,rdx + mov rbp,rax + mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[8+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,QWORD PTR[16+rbx] + adc rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov rbp,rax + mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[16+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,QWORD PTR[24+rbx] + adc rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov rbp,rax + mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[24+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,QWORD PTR[32+rbx] + adc rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov rbp,rax + 
mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[32+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,QWORD PTR[40+rbx] + adc rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov rbp,rax + mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[40+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,rax + adc rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov QWORD PTR[48+rdi],rcx + mov QWORD PTR[56+rdi],r8 + mov QWORD PTR[64+rdi],r9 + mov QWORD PTR[72+rdi],r10 + mov QWORD PTR[80+rdi],r11 + mov QWORD PTR[88+rdi],r12 + + DB 0F3h,0C3h ;repret +__mulq_384 ENDP +PUBLIC sqr_384 + + +ALIGN 32 +sqr_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_384:: + mov rdi,rcx + mov rsi,rdx + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sqr_384:: + + + call __sqrq_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqr_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_384:: +sqr_384 ENDP + + +ALIGN 32 +__sqrq_384 PROC PRIVATE + DB 243,15,30,250 + mov rax,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rcx,QWORD PTR[16+rsi] + mov rbx,QWORD PTR[24+rsi] + + + mov r14,rax + mul r15 + mov r9,rax + mov rax,r14 + mov rbp,QWORD PTR[32+rsi] + mov r10,rdx + + mul rcx + add r10,rax + mov rax,r14 + adc rdx,0 + mov rsi,QWORD PTR[40+rsi] + mov r11,rdx + + mul rbx + add r11,rax + mov rax,r14 + adc rdx,0 + mov r12,rdx + + mul rbp + add r12,rax + mov rax,r14 + adc rdx,0 + mov r13,rdx + + mul rsi + add r13,rax + mov rax,r14 + adc rdx,0 + mov r14,rdx + + mul rax + xor r8,r8 + mov QWORD PTR[rdi],rax + mov rax,r15 + add r9,r9 + adc r8,0 + add r9,rdx + adc r8,0 + mov QWORD PTR[8+rdi],r9 + + mul rcx + add r11,rax + mov rax,r15 + adc rdx,0 + mov r9,rdx + + mul rbx + add r12,rax + mov rax,r15 + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mul rbp + add r13,rax + mov rax,r15 + adc rdx,0 + add r13,r9 + adc rdx,0 + mov r9,rdx + + mul rsi + add r14,rax + mov rax,r15 + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r15,rdx + + mul rax + xor r9,r9 + add r8,rax + mov rax,rcx + add r10,r10 + adc r11,r11 + adc r9,0 + add r10,r8 + adc r11,rdx + adc r9,0 + mov QWORD PTR[16+rdi],r10 + + mul rbx + add r13,rax + mov rax,rcx + adc rdx,0 + mov QWORD PTR[24+rdi],r11 + mov r8,rdx + + mul rbp + add r14,rax + mov rax,rcx + adc rdx,0 + add r14,r8 + adc 
rdx,0 + mov r8,rdx + + mul rsi + add r15,rax + mov rax,rcx + adc rdx,0 + add r15,r8 + adc rdx,0 + mov rcx,rdx + + mul rax + xor r11,r11 + add r9,rax + mov rax,rbx + add r12,r12 + adc r13,r13 + adc r11,0 + add r12,r9 + adc r13,rdx + adc r11,0 + mov QWORD PTR[32+rdi],r12 + + + mul rbp + add r15,rax + mov rax,rbx + adc rdx,0 + mov QWORD PTR[40+rdi],r13 + mov r8,rdx + + mul rsi + add rcx,rax + mov rax,rbx + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov rbx,rdx + + mul rax + xor r12,r12 + add r11,rax + mov rax,rbp + add r14,r14 + adc r15,r15 + adc r12,0 + add r14,r11 + adc r15,rdx + mov QWORD PTR[48+rdi],r14 + adc r12,0 + mov QWORD PTR[56+rdi],r15 + + + mul rsi + add rbx,rax + mov rax,rbp + adc rdx,0 + mov rbp,rdx + + mul rax + xor r13,r13 + add r12,rax + mov rax,rsi + add rcx,rcx + adc rbx,rbx + adc r13,0 + add rcx,r12 + adc rbx,rdx + mov QWORD PTR[64+rdi],rcx + adc r13,0 + mov QWORD PTR[72+rdi],rbx + + + mul rax + add rax,r13 + add rbp,rbp + adc rdx,0 + add rax,rbp + adc rdx,0 + mov QWORD PTR[80+rdi],rax + mov QWORD PTR[88+rdi],rdx + + DB 0F3h,0C3h ;repret +__sqrq_384 ENDP + +PUBLIC sqr_mont_384 + + +ALIGN 32 +sqr_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8*15 + +$L$SEH_body_sqr_mont_384:: + + + mov QWORD PTR[96+rsp],rcx + mov QWORD PTR[104+rsp],rdx + mov QWORD PTR[112+rsp],rdi + + mov rdi,rsp + call __sqrq_384 + + lea rsi,QWORD PTR[rsp] + mov rcx,QWORD PTR[96+rsp] + mov rbx,QWORD PTR[104+rsp] + mov rdi,QWORD PTR[112+rsp] + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + lea r8,QWORD PTR[120+rsp] + mov r15,QWORD PTR[120+rsp] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_mont_384:: +sqr_mont_384 ENDP + + + +PUBLIC redc_mont_384 + + +ALIGN 32 +redc_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_redc_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_redc_mont_384:: + + + mov rbx,rdx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_redc_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_redc_mont_384:: +redc_mont_384 ENDP + + + + +PUBLIC from_mont_384 + + +ALIGN 32 +from_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_from_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_from_mont_384:: + + + mov rbx,rdx + call __mulq_by_1_mont_384 + + + + + + mov rcx,r15 + mov rdx,r8 + mov rbp,r9 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + mov 
r13,r10 + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rbx] + + cmovc r14,rax + cmovc r15,rcx + cmovc r8,rdx + mov QWORD PTR[rdi],r14 + cmovc r9,rbp + mov QWORD PTR[8+rdi],r15 + cmovc r10,r13 + mov QWORD PTR[16+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_from_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_from_mont_384:: +from_mont_384 ENDP + +ALIGN 32 +__mulq_by_1_mont_384 PROC PRIVATE + DB 243,15,30,250 + mov rax,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,rax + imul rax,rcx + mov r8,rax + + mul QWORD PTR[rbx] + add r14,rax + mov rax,r8 + adc r14,rdx + + mul QWORD PTR[8+rbx] + add r9,rax + mov rax,r8 + adc rdx,0 + add r9,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[16+rbx] + add r10,rax + mov rax,r8 + adc rdx,0 + add r10,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[24+rbx] + add r11,rax + mov rax,r8 + adc rdx,0 + mov r15,r9 + imul r9,rcx + add r11,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[32+rbx] + add r12,rax + mov rax,r8 + adc rdx,0 + add r12,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[40+rbx] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[rbx] + add r15,rax + mov rax,r9 + adc r15,rdx + + mul QWORD PTR[8+rbx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rbx] + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[24+rbx] + add r12,rax + mov rax,r9 + adc rdx,0 + mov r8,r10 + imul r10,rcx + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[32+rbx] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[40+rbx] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[rbx] + add r8,rax + mov rax,r10 + adc r8,rdx + + mul QWORD PTR[8+rbx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rbx] + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[24+rbx] + add r13,rax + mov rax,r10 + adc rdx,0 + mov r9,r11 + imul r11,rcx + add r13,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[32+rbx] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[40+rbx] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[rbx] + add r9,rax + mov rax,r11 + adc r9,rdx + + mul QWORD PTR[8+rbx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rbx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rbx] + add r14,rax + mov rax,r11 + adc rdx,0 + mov r10,r12 + imul r12,rcx + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[32+rbx] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[40+rbx] + add r8,rax + mov rax,r12 + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[rbx] + add r10,rax + mov rax,r12 + 
adc r10,rdx + + mul QWORD PTR[8+rbx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rbx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[24+rbx] + add r15,rax + mov rax,r12 + adc rdx,0 + mov r11,r13 + imul r13,rcx + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rbx] + add r8,rax + mov rax,r12 + adc rdx,0 + add r8,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[40+rbx] + add r9,rax + mov rax,r13 + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[rbx] + add r11,rax + mov rax,r13 + adc r11,rdx + + mul QWORD PTR[8+rbx] + add r14,rax + mov rax,r13 + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rbx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[24+rbx] + add r8,rax + mov rax,r13 + adc rdx,0 + add r8,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[32+rbx] + add r9,rax + mov rax,r13 + adc rdx,0 + add r9,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rbx] + add r10,rax + mov rax,r14 + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + DB 0F3h,0C3h ;repret +__mulq_by_1_mont_384 ENDP + + +ALIGN 32 +__redc_tail_mont_384 PROC PRIVATE + DB 243,15,30,250 + add r14,QWORD PTR[48+rsi] + mov rax,r14 + adc r15,QWORD PTR[56+rsi] + adc r8,QWORD PTR[64+rsi] + adc r9,QWORD PTR[72+rsi] + mov rcx,r15 + adc r10,QWORD PTR[80+rsi] + adc r11,QWORD PTR[88+rsi] + sbb r12,r12 + + + + + mov rdx,r8 + mov rbp,r9 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + mov r13,r10 + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rbx] + sbb r12,0 + + cmovc r14,rax + cmovc r15,rcx + cmovc r8,rdx + mov QWORD PTR[rdi],r14 + cmovc r9,rbp + mov QWORD PTR[8+rdi],r15 + cmovc r10,r13 + mov QWORD PTR[16+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + DB 0F3h,0C3h ;repret +__redc_tail_mont_384 ENDP + +PUBLIC sgn0_pty_mont_384 + + +ALIGN 32 +sgn0_pty_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0_pty_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sgn0_pty_mont_384:: + + + mov rbx,rsi + lea rsi,QWORD PTR[rdi] + mov rcx,rdx + call __mulq_by_1_mont_384 + + xor rax,rax + mov r13,r14 + add r14,r14 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rax,0 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rax,0 + + not rax + and r13,1 + and rax,2 + or rax,r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sgn0_pty_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0_pty_mont_384:: +sgn0_pty_mont_384 ENDP + +PUBLIC sgn0_pty_mont_384x + + +ALIGN 32 +sgn0_pty_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0_pty_mont_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push 
r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sgn0_pty_mont_384x:: + + + mov rbx,rsi + lea rsi,QWORD PTR[48+rdi] + mov rcx,rdx + call __mulq_by_1_mont_384 + + mov r12,r14 + or r14,r15 + or r14,r8 + or r14,r9 + or r14,r10 + or r14,r11 + + lea rsi,QWORD PTR[rdi] + xor rdi,rdi + mov r13,r12 + add r12,r12 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rdi,0 + + sub r12,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rdi,0 + + mov QWORD PTR[rsp],r14 + not rdi + and r13,1 + and rdi,2 + or rdi,r13 + + call __mulq_by_1_mont_384 + + mov r12,r14 + or r14,r15 + or r14,r8 + or r14,r9 + or r14,r10 + or r14,r11 + + xor rax,rax + mov r13,r12 + add r12,r12 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rax,0 + + sub r12,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rax,0 + + mov r12,QWORD PTR[rsp] + + not rax + + test r14,r14 + cmovz r13,rdi + + test r12,r12 + cmovnz rax,rdi + + and r13,1 + and rax,2 + or rax,r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sgn0_pty_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0_pty_mont_384x:: +sgn0_pty_mont_384x ENDP +PUBLIC mul_mont_384 + + +ALIGN 32 +mul_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8*3 + +$L$SEH_body_mul_mont_384:: + + + mov rax,QWORD PTR[rdx] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov r13,QWORD PTR[24+rsi] + mov rbx,rdx + mov QWORD PTR[rsp],r8 + mov QWORD PTR[8+rsp],rdi + + call __mulq_mont_384 + + mov r15,QWORD PTR[24+rsp] + + mov r14,QWORD PTR[32+rsp] + + mov r13,QWORD PTR[40+rsp] + + mov r12,QWORD PTR[48+rsp] + + mov rbx,QWORD PTR[56+rsp] + + mov rbp,QWORD PTR[64+rsp] + + lea rsp,QWORD PTR[72+rsp] + +$L$SEH_epilogue_mul_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_mont_384:: +mul_mont_384 ENDP + +ALIGN 32 +__mulq_mont_384 PROC PRIVATE + DB 243,15,30,250 + mov rdi,rax + mul r14 + mov r8,rax + mov rax,rdi + mov r9,rdx + + mul r15 + add r9,rax + mov rax,rdi + adc rdx,0 + mov r10,rdx + + mul r12 + add r10,rax + mov rax,rdi + adc rdx,0 + mov r11,rdx + + mov rbp,r8 + imul r8,QWORD PTR[8+rsp] + + mul r13 + add r11,rax + mov rax,rdi + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[32+rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[40+rsi] + add r13,rax + mov rax,r8 + adc rdx,0 + xor r15,r15 + mov r14,rdx + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r8 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r9,rax + mov rax,r8 + adc rdx,0 + add r9,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r10,rax + mov rax,r8 + adc rdx,0 + add r10,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r11,rbp + adc rdx,0 + add r11,rax + mov rax,r8 + adc rdx,0 + mov rbp,rdx + + mul QWORD 
PTR[32+rcx] + add r12,rax + mov rax,r8 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r13,rax + mov rax,QWORD PTR[8+rbx] + adc rdx,0 + add r13,rbp + adc r14,rdx + adc r15,0 + + mov rdi,rax + mul QWORD PTR[rsi] + add r9,rax + mov rax,rdi + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[8+rsi] + add r10,rax + mov rax,rdi + adc rdx,0 + add r10,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r11,rax + mov rax,rdi + adc rdx,0 + add r11,r8 + adc rdx,0 + mov r8,rdx + + mov rbp,r9 + imul r9,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + add r12,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[32+rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + add r13,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[40+rsi] + add r14,r8 + adc rdx,0 + xor r8,r8 + add r14,rax + mov rax,r9 + adc r15,rdx + adc r8,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r9 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r12,rbp + adc rdx,0 + add r12,rax + mov rax,r9 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r14,rax + mov rax,QWORD PTR[16+rbx] + adc rdx,0 + add r14,rbp + adc r15,rdx + adc r8,0 + + mov rdi,rax + mul QWORD PTR[rsi] + add r10,rax + mov rax,rdi + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[8+rsi] + add r11,rax + mov rax,rdi + adc rdx,0 + add r11,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mov rbp,r10 + imul r10,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + add r13,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[32+rsi] + add r14,rax + mov rax,rdi + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[40+rsi] + add r15,r9 + adc rdx,0 + xor r9,r9 + add r15,rax + mov rax,r10 + adc r8,rdx + adc r9,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r10 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r13,rbp + adc rdx,0 + add r13,rax + mov rax,r10 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r15,rax + mov rax,QWORD PTR[24+rbx] + adc rdx,0 + add r15,rbp + adc r8,rdx + adc r9,0 + + mov rdi,rax + mul QWORD PTR[rsi] + add r11,rax + mov rax,rdi + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[8+rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + add r12,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mov rbp,r11 + imul r11,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r14,rax + mov rax,rdi + adc rdx,0 + add r14,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r15,rax + mov rax,rdi + adc rdx,0 + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[40+rsi] + add r8,r10 + adc rdx,0 + xor r10,r10 + add r8,rax + mov rax,r11 + adc r9,rdx + adc r10,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r11 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r12,rax + mov rax,r11 + adc rdx,0 + add 
r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r14,rbp + adc rdx,0 + add r14,rax + mov rax,r11 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r8,rax + mov rax,QWORD PTR[32+rbx] + adc rdx,0 + add r8,rbp + adc r9,rdx + adc r10,0 + + mov rdi,rax + mul QWORD PTR[rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[8+rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + add r13,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rsi] + add r14,rax + mov rax,rdi + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mov rbp,r12 + imul r12,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r15,rax + mov rax,rdi + adc rdx,0 + add r15,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[32+rsi] + add r8,rax + mov rax,rdi + adc rdx,0 + add r8,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r9,r11 + adc rdx,0 + xor r11,r11 + add r9,rax + mov rax,r12 + adc r10,rdx + adc r11,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r12 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r15,rbp + adc rdx,0 + add r15,rax + mov rax,r12 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r8,rax + mov rax,r12 + adc rdx,0 + add r8,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r9,rax + mov rax,QWORD PTR[40+rbx] + adc rdx,0 + add r9,rbp + adc r10,rdx + adc r11,0 + + mov rdi,rax + mul QWORD PTR[rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[8+rsi] + add r14,rax + mov rax,rdi + adc rdx,0 + add r14,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[16+rsi] + add r15,rax + mov rax,rdi + adc rdx,0 + add r15,r12 + adc rdx,0 + mov r12,rdx + + mov rbp,r13 + imul r13,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r8,rax + mov rax,rdi + adc rdx,0 + add r8,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[32+rsi] + add r9,rax + mov rax,rdi + adc rdx,0 + add r9,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[40+rsi] + add r10,r12 + adc rdx,0 + xor r12,r12 + add r10,rax + mov rax,r13 + adc r11,rdx + adc r12,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r13 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r14,rax + mov rax,r13 + adc rdx,0 + add r14,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r8,rbp + adc rdx,0 + add r8,rax + mov rax,r13 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r9,rax + mov rax,r13 + adc rdx,0 + add r9,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r10,rax + mov rax,r14 + adc rdx,0 + add r10,rbp + adc r11,rdx + adc r12,0 + + + + + mov rdi,QWORD PTR[16+rsp] + sub r14,QWORD PTR[rcx] + mov rdx,r15 + sbb r15,QWORD PTR[8+rcx] + mov rbx,r8 + sbb r8,QWORD PTR[16+rcx] + mov rsi,r9 + sbb r9,QWORD PTR[24+rcx] + mov rbp,r10 + sbb r10,QWORD PTR[32+rcx] + mov r13,r11 + sbb r11,QWORD PTR[40+rcx] + sbb r12,0 + + cmovc r14,rax + cmovc r15,rdx + cmovc r8,rbx + mov QWORD PTR[rdi],r14 + cmovc r9,rsi + mov QWORD PTR[8+rdi],r15 + cmovc r10,rbp + mov QWORD PTR[16+rdi],r8 + cmovc r11,r13 + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + DB 
0F3h,0C3h ;repret +__mulq_mont_384 ENDP +PUBLIC sqr_n_mul_mont_384 + + +ALIGN 32 +sqr_n_mul_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_n_mul_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + mov r9,QWORD PTR[48+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8*17 + +$L$SEH_body_sqr_n_mul_mont_384:: + + + mov QWORD PTR[rsp],r8 + mov QWORD PTR[8+rsp],rdi + mov QWORD PTR[16+rsp],rcx + lea rdi,QWORD PTR[32+rsp] + mov QWORD PTR[24+rsp],r9 + movq xmm2,QWORD PTR[r9] + +$L$oop_sqr_384:: + movd xmm1,edx + + call __sqrq_384 + + lea rsi,QWORD PTR[rdi] + mov rcx,QWORD PTR[rsp] + mov rbx,QWORD PTR[16+rsp] + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movd edx,xmm1 + lea rsi,QWORD PTR[rdi] + dec edx + jnz $L$oop_sqr_384 + +DB 102,72,15,126,208 + mov rcx,rbx + mov rbx,QWORD PTR[24+rsp] + + + + + + + mov r12,r8 + mov r13,r9 + + call __mulq_mont_384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[136+rsp] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_n_mul_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_n_mul_mont_384:: +sqr_n_mul_mont_384 ENDP + +PUBLIC sqr_n_mul_mont_383 + + +ALIGN 32 +sqr_n_mul_mont_383 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_n_mul_mont_383:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + mov r9,QWORD PTR[48+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8*17 + +$L$SEH_body_sqr_n_mul_mont_383:: + + + mov QWORD PTR[rsp],r8 + mov QWORD PTR[8+rsp],rdi + mov QWORD PTR[16+rsp],rcx + lea rdi,QWORD PTR[32+rsp] + mov QWORD PTR[24+rsp],r9 + movq xmm2,QWORD PTR[r9] + +$L$oop_sqr_383:: + movd xmm1,edx + + call __sqrq_384 + + lea rsi,QWORD PTR[rdi] + mov rcx,QWORD PTR[rsp] + mov rbx,QWORD PTR[16+rsp] + call __mulq_by_1_mont_384 + + movd edx,xmm1 + add r14,QWORD PTR[48+rsi] + adc r15,QWORD PTR[56+rsi] + adc r8,QWORD PTR[64+rsi] + adc r9,QWORD PTR[72+rsi] + adc r10,QWORD PTR[80+rsi] + adc r11,QWORD PTR[88+rsi] + lea rsi,QWORD PTR[rdi] + + mov QWORD PTR[rdi],r14 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],r8 + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + dec edx + jnz $L$oop_sqr_383 + +DB 102,72,15,126,208 + mov rcx,rbx + mov rbx,QWORD PTR[24+rsp] + + + + + + + mov r12,r8 + mov r13,r9 + + call __mulq_mont_384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[136+rsp] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_n_mul_mont_383:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_n_mul_mont_383:: +sqr_n_mul_mont_383 ENDP + +ALIGN 32 +__mulq_mont_383_nonred PROC PRIVATE + DB 243,15,30,250 + mov rbp,rax + mul r14 + mov r8,rax + mov rax,rbp + mov r9,rdx + + mul r15 + add r9,rax + mov rax,rbp + adc rdx,0 + mov r10,rdx + + mul r12 + add r10,rax + mov rax,rbp + adc rdx,0 + mov r11,rdx + + mov r15,r8 + imul r8,QWORD PTR[8+rsp] + + mul r13 + add 
r11,rax + mov rax,rbp + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[32+rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[40+rsi] + add r13,rax + mov rax,r8 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[rcx] + add r15,rax + mov rax,r8 + adc r15,rdx + + mul QWORD PTR[8+rcx] + add r9,rax + mov rax,r8 + adc rdx,0 + add r9,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rcx] + add r10,rax + mov rax,r8 + adc rdx,0 + add r10,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[24+rcx] + add r11,r15 + adc rdx,0 + add r11,rax + mov rax,r8 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[32+rcx] + add r12,rax + mov rax,r8 + adc rdx,0 + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[40+rcx] + add r13,rax + mov rax,QWORD PTR[8+rbx] + adc rdx,0 + add r13,r15 + adc r14,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[8+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r10,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r11,r15 + adc rdx,0 + mov r15,rdx + + mov r8,r9 + imul r9,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[32+rsi] + add r13,rax + mov rax,rbp + adc rdx,0 + add r13,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[40+rsi] + add r14,r15 + adc rdx,0 + add r14,rax + mov rax,r9 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[rcx] + add r8,rax + mov rax,r9 + adc r8,rdx + + mul QWORD PTR[8+rcx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rcx] + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[24+rcx] + add r12,r8 + adc rdx,0 + add r12,rax + mov rax,r9 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[32+rcx] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[40+rcx] + add r14,rax + mov rax,QWORD PTR[16+rbx] + adc rdx,0 + add r14,r8 + adc r15,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[8+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r11,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + add r12,r8 + adc rdx,0 + mov r8,rdx + + mov r9,r10 + imul r10,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r13,rax + mov rax,rbp + adc rdx,0 + add r13,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[32+rsi] + add r14,rax + mov rax,rbp + adc rdx,0 + add r14,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[40+rsi] + add r15,r8 + adc rdx,0 + add r15,rax + mov rax,r10 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[rcx] + add r9,rax + mov rax,r10 + adc r9,rdx + + mul QWORD PTR[8+rcx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rcx] + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rcx] + add r13,r9 + adc rdx,0 + add r13,rax + mov rax,r10 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[32+rcx] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[40+rcx] + add r15,rax + mov rax,QWORD PTR[24+rbx] + adc rdx,0 + add r15,r9 + adc r8,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[8+rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rsi] + add r13,rax + mov rax,rbp + adc rdx,0 + add r13,r9 + adc rdx,0 
+ mov r9,rdx + + mov r10,r11 + imul r11,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r14,rax + mov rax,rbp + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[32+rsi] + add r15,rax + mov rax,rbp + adc rdx,0 + add r15,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[40+rsi] + add r8,r9 + adc rdx,0 + add r8,rax + mov rax,r11 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[rcx] + add r10,rax + mov rax,r11 + adc r10,rdx + + mul QWORD PTR[8+rcx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rcx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[24+rcx] + add r14,r10 + adc rdx,0 + add r14,rax + mov rax,r11 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rcx] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[40+rcx] + add r8,rax + mov rax,QWORD PTR[32+rbx] + adc rdx,0 + add r8,r10 + adc r9,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[8+rsi] + add r13,rax + mov rax,rbp + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rsi] + add r14,rax + mov rax,rbp + adc rdx,0 + add r14,r10 + adc rdx,0 + mov r10,rdx + + mov r11,r12 + imul r12,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r15,rax + mov rax,rbp + adc rdx,0 + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add r8,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[40+rsi] + add r9,r10 + adc rdx,0 + add r9,rax + mov rax,r12 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[rcx] + add r11,rax + mov rax,r12 + adc r11,rdx + + mul QWORD PTR[8+rcx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rcx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[24+rcx] + add r15,r11 + adc rdx,0 + add r15,rax + mov rax,r12 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[32+rcx] + add r8,rax + mov rax,r12 + adc rdx,0 + add r8,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rcx] + add r9,rax + mov rax,QWORD PTR[40+rbx] + adc rdx,0 + add r9,r11 + adc r10,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r13,rax + mov rax,rbp + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[8+rsi] + add r14,rax + mov rax,rbp + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rsi] + add r15,rax + mov rax,rbp + adc rdx,0 + add r15,r11 + adc rdx,0 + mov r11,rdx + + mov r12,r13 + imul r13,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add r8,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[32+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r9,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r10,r11 + adc rdx,0 + add r10,rax + mov rax,r13 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[rcx] + add r12,rax + mov rax,r13 + adc r12,rdx + + mul QWORD PTR[8+rcx] + add r14,rax + mov rax,r13 + adc rdx,0 + add r14,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[16+rcx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[24+rcx] + add r8,r12 + adc rdx,0 + add r8,rax + mov rax,r13 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[32+rcx] + add r9,rax + mov rax,r13 + adc rdx,0 + add r9,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[40+rcx] + add r10,rax + mov rax,r14 + adc rdx,0 + add r10,r12 + adc r11,rdx + DB 0F3h,0C3h ;repret +__mulq_mont_383_nonred ENDP +PUBLIC sqr_mont_382x + + +ALIGN 32 +sqr_mont_382x 
PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_mont_382x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_sqr_mont_382x:: + + + mov QWORD PTR[rsp],rcx + mov rcx,rdx + mov QWORD PTR[16+rsp],rsi + mov QWORD PTR[24+rsp],rdi + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,r8 + add r8,QWORD PTR[48+rsi] + mov r15,r9 + adc r9,QWORD PTR[56+rsi] + mov rax,r10 + adc r10,QWORD PTR[64+rsi] + mov rdx,r11 + adc r11,QWORD PTR[72+rsi] + mov rbx,r12 + adc r12,QWORD PTR[80+rsi] + mov rbp,r13 + adc r13,QWORD PTR[88+rsi] + + sub r14,QWORD PTR[48+rsi] + sbb r15,QWORD PTR[56+rsi] + sbb rax,QWORD PTR[64+rsi] + sbb rdx,QWORD PTR[72+rsi] + sbb rbx,QWORD PTR[80+rsi] + sbb rbp,QWORD PTR[88+rsi] + sbb rdi,rdi + + mov QWORD PTR[((32+0))+rsp],r8 + mov QWORD PTR[((32+8))+rsp],r9 + mov QWORD PTR[((32+16))+rsp],r10 + mov QWORD PTR[((32+24))+rsp],r11 + mov QWORD PTR[((32+32))+rsp],r12 + mov QWORD PTR[((32+40))+rsp],r13 + + mov QWORD PTR[((32+48))+rsp],r14 + mov QWORD PTR[((32+56))+rsp],r15 + mov QWORD PTR[((32+64))+rsp],rax + mov QWORD PTR[((32+72))+rsp],rdx + mov QWORD PTR[((32+80))+rsp],rbx + mov QWORD PTR[((32+88))+rsp],rbp + mov QWORD PTR[((32+96))+rsp],rdi + + + + lea rbx,QWORD PTR[48+rsi] + + mov rax,QWORD PTR[48+rsi] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov r13,QWORD PTR[24+rsi] + + mov rdi,QWORD PTR[24+rsp] + call __mulq_mont_383_nonred + add r14,r14 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + + mov QWORD PTR[48+rdi],r14 + mov QWORD PTR[56+rdi],r15 + mov QWORD PTR[64+rdi],r8 + mov QWORD PTR[72+rdi],r9 + mov QWORD PTR[80+rdi],r10 + mov QWORD PTR[88+rdi],r11 + + lea rsi,QWORD PTR[32+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + + mov rax,QWORD PTR[((32+48))+rsp] + mov r14,QWORD PTR[((32+0))+rsp] + mov r15,QWORD PTR[((32+8))+rsp] + mov r12,QWORD PTR[((32+16))+rsp] + mov r13,QWORD PTR[((32+24))+rsp] + + call __mulq_mont_383_nonred + mov rsi,QWORD PTR[((32+96))+rsp] + mov r12,QWORD PTR[((32+0))+rsp] + mov r13,QWORD PTR[((32+8))+rsp] + and r12,rsi + mov rax,QWORD PTR[((32+16))+rsp] + and r13,rsi + mov rbx,QWORD PTR[((32+24))+rsp] + and rax,rsi + mov rbp,QWORD PTR[((32+32))+rsp] + and rbx,rsi + and rbp,rsi + and rsi,QWORD PTR[((32+40))+rsp] + + sub r14,r12 + mov r12,QWORD PTR[rcx] + sbb r15,r13 + mov r13,QWORD PTR[8+rcx] + sbb r8,rax + mov rax,QWORD PTR[16+rcx] + sbb r9,rbx + mov rbx,QWORD PTR[24+rcx] + sbb r10,rbp + mov rbp,QWORD PTR[32+rcx] + sbb r11,rsi + sbb rsi,rsi + + and r12,rsi + and r13,rsi + and rax,rsi + and rbx,rsi + and rbp,rsi + and rsi,QWORD PTR[40+rcx] + + add r14,r12 + adc r15,r13 + adc r8,rax + adc r9,rbx + adc r10,rbp + adc r11,rsi + + mov QWORD PTR[rdi],r14 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],r8 + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_mont_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_mont_382x:: +sqr_mont_382x ENDP +.text$ ENDS 
+.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_mul_mont_384x + DD imagerel $L$SEH_body_mul_mont_384x + DD imagerel $L$SEH_info_mul_mont_384x_prologue + + DD imagerel $L$SEH_body_mul_mont_384x + DD imagerel $L$SEH_epilogue_mul_mont_384x + DD imagerel $L$SEH_info_mul_mont_384x_body + + DD imagerel $L$SEH_epilogue_mul_mont_384x + DD imagerel $L$SEH_end_mul_mont_384x + DD imagerel $L$SEH_info_mul_mont_384x_epilogue + + DD imagerel $L$SEH_begin_sqr_mont_384x + DD imagerel $L$SEH_body_sqr_mont_384x + DD imagerel $L$SEH_info_sqr_mont_384x_prologue + + DD imagerel $L$SEH_body_sqr_mont_384x + DD imagerel $L$SEH_epilogue_sqr_mont_384x + DD imagerel $L$SEH_info_sqr_mont_384x_body + + DD imagerel $L$SEH_epilogue_sqr_mont_384x + DD imagerel $L$SEH_end_sqr_mont_384x + DD imagerel $L$SEH_info_sqr_mont_384x_epilogue + + DD imagerel $L$SEH_begin_mul_382x + DD imagerel $L$SEH_body_mul_382x + DD imagerel $L$SEH_info_mul_382x_prologue + + DD imagerel $L$SEH_body_mul_382x + DD imagerel $L$SEH_epilogue_mul_382x + DD imagerel $L$SEH_info_mul_382x_body + + DD imagerel $L$SEH_epilogue_mul_382x + DD imagerel $L$SEH_end_mul_382x + DD imagerel $L$SEH_info_mul_382x_epilogue + + DD imagerel $L$SEH_begin_sqr_382x + DD imagerel $L$SEH_body_sqr_382x + DD imagerel $L$SEH_info_sqr_382x_prologue + + DD imagerel $L$SEH_body_sqr_382x + DD imagerel $L$SEH_epilogue_sqr_382x + DD imagerel $L$SEH_info_sqr_382x_body + + DD imagerel $L$SEH_epilogue_sqr_382x + DD imagerel $L$SEH_end_sqr_382x + DD imagerel $L$SEH_info_sqr_382x_epilogue + + DD imagerel $L$SEH_begin_mul_384 + DD imagerel $L$SEH_body_mul_384 + DD imagerel $L$SEH_info_mul_384_prologue + + DD imagerel $L$SEH_body_mul_384 + DD imagerel $L$SEH_epilogue_mul_384 + DD imagerel $L$SEH_info_mul_384_body + + DD imagerel $L$SEH_epilogue_mul_384 + DD imagerel $L$SEH_end_mul_384 + DD imagerel $L$SEH_info_mul_384_epilogue + + DD imagerel $L$SEH_begin_sqr_384 + DD imagerel $L$SEH_body_sqr_384 + DD imagerel $L$SEH_info_sqr_384_prologue + + DD imagerel $L$SEH_body_sqr_384 + DD imagerel $L$SEH_epilogue_sqr_384 + DD imagerel $L$SEH_info_sqr_384_body + + DD imagerel $L$SEH_epilogue_sqr_384 + DD imagerel $L$SEH_end_sqr_384 + DD imagerel $L$SEH_info_sqr_384_epilogue + + DD imagerel $L$SEH_begin_sqr_mont_384 + DD imagerel $L$SEH_body_sqr_mont_384 + DD imagerel $L$SEH_info_sqr_mont_384_prologue + + DD imagerel $L$SEH_body_sqr_mont_384 + DD imagerel $L$SEH_epilogue_sqr_mont_384 + DD imagerel $L$SEH_info_sqr_mont_384_body + + DD imagerel $L$SEH_epilogue_sqr_mont_384 + DD imagerel $L$SEH_end_sqr_mont_384 + DD imagerel $L$SEH_info_sqr_mont_384_epilogue + + DD imagerel $L$SEH_begin_redc_mont_384 + DD imagerel $L$SEH_body_redc_mont_384 + DD imagerel $L$SEH_info_redc_mont_384_prologue + + DD imagerel $L$SEH_body_redc_mont_384 + DD imagerel $L$SEH_epilogue_redc_mont_384 + DD imagerel $L$SEH_info_redc_mont_384_body + + DD imagerel $L$SEH_epilogue_redc_mont_384 + DD imagerel $L$SEH_end_redc_mont_384 + DD imagerel $L$SEH_info_redc_mont_384_epilogue + + DD imagerel $L$SEH_begin_from_mont_384 + DD imagerel $L$SEH_body_from_mont_384 + DD imagerel $L$SEH_info_from_mont_384_prologue + + DD imagerel $L$SEH_body_from_mont_384 + DD imagerel $L$SEH_epilogue_from_mont_384 + DD imagerel $L$SEH_info_from_mont_384_body + + DD imagerel $L$SEH_epilogue_from_mont_384 + DD imagerel $L$SEH_end_from_mont_384 + DD imagerel $L$SEH_info_from_mont_384_epilogue + + DD imagerel $L$SEH_begin_sgn0_pty_mont_384 + DD imagerel $L$SEH_body_sgn0_pty_mont_384 + DD imagerel $L$SEH_info_sgn0_pty_mont_384_prologue + + 
DD imagerel $L$SEH_body_sgn0_pty_mont_384 + DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384 + DD imagerel $L$SEH_info_sgn0_pty_mont_384_body + + DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384 + DD imagerel $L$SEH_end_sgn0_pty_mont_384 + DD imagerel $L$SEH_info_sgn0_pty_mont_384_epilogue + + DD imagerel $L$SEH_begin_sgn0_pty_mont_384x + DD imagerel $L$SEH_body_sgn0_pty_mont_384x + DD imagerel $L$SEH_info_sgn0_pty_mont_384x_prologue + + DD imagerel $L$SEH_body_sgn0_pty_mont_384x + DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384x + DD imagerel $L$SEH_info_sgn0_pty_mont_384x_body + + DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384x + DD imagerel $L$SEH_end_sgn0_pty_mont_384x + DD imagerel $L$SEH_info_sgn0_pty_mont_384x_epilogue + + DD imagerel $L$SEH_begin_mul_mont_384 + DD imagerel $L$SEH_body_mul_mont_384 + DD imagerel $L$SEH_info_mul_mont_384_prologue + + DD imagerel $L$SEH_body_mul_mont_384 + DD imagerel $L$SEH_epilogue_mul_mont_384 + DD imagerel $L$SEH_info_mul_mont_384_body + + DD imagerel $L$SEH_epilogue_mul_mont_384 + DD imagerel $L$SEH_end_mul_mont_384 + DD imagerel $L$SEH_info_mul_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqr_n_mul_mont_384 + DD imagerel $L$SEH_body_sqr_n_mul_mont_384 + DD imagerel $L$SEH_info_sqr_n_mul_mont_384_prologue + + DD imagerel $L$SEH_body_sqr_n_mul_mont_384 + DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_384 + DD imagerel $L$SEH_info_sqr_n_mul_mont_384_body + + DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_384 + DD imagerel $L$SEH_end_sqr_n_mul_mont_384 + DD imagerel $L$SEH_info_sqr_n_mul_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqr_n_mul_mont_383 + DD imagerel $L$SEH_body_sqr_n_mul_mont_383 + DD imagerel $L$SEH_info_sqr_n_mul_mont_383_prologue + + DD imagerel $L$SEH_body_sqr_n_mul_mont_383 + DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_383 + DD imagerel $L$SEH_info_sqr_n_mul_mont_383_body + + DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_383 + DD imagerel $L$SEH_end_sqr_n_mul_mont_383 + DD imagerel $L$SEH_info_sqr_n_mul_mont_383_epilogue + + DD imagerel $L$SEH_begin_sqr_mont_382x + DD imagerel $L$SEH_body_sqr_mont_382x + DD imagerel $L$SEH_info_sqr_mont_382x_prologue + + DD imagerel $L$SEH_body_sqr_mont_382x + DD imagerel $L$SEH_epilogue_sqr_mont_382x + DD imagerel $L$SEH_info_sqr_mont_382x_body + + DD imagerel $L$SEH_epilogue_sqr_mont_382x + DD imagerel $L$SEH_end_sqr_mont_382x + DD imagerel $L$SEH_info_sqr_mont_382x_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_mul_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_mont_384x_body:: +DB 1,0,18,0 +DB 000h,0f4h,029h,000h +DB 000h,0e4h,02ah,000h +DB 000h,0d4h,02bh,000h +DB 000h,0c4h,02ch,000h +DB 000h,034h,02dh,000h +DB 000h,054h,02eh,000h +DB 000h,074h,030h,000h +DB 000h,064h,031h,000h +DB 000h,001h,02fh,000h +$L$SEH_info_mul_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_mont_384x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_sqr_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 
+$L$SEH_info_mul_382x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_mul_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_382x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sqr_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_384_body:: +DB 1,0,11,0 +DB 000h,0c4h,000h,000h +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +$L$SEH_info_mul_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sqr_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_mont_384_body:: +DB 1,0,18,0 +DB 000h,0f4h,00fh,000h +DB 000h,0e4h,010h,000h +DB 000h,0d4h,011h,000h +DB 000h,0c4h,012h,000h +DB 000h,034h,013h,000h +DB 000h,054h,014h,000h +DB 000h,074h,016h,000h +DB 000h,064h,017h,000h +DB 000h,001h,015h,000h +$L$SEH_info_sqr_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_redc_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_redc_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_redc_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_from_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_from_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_from_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0_pty_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sgn0_pty_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 
000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sgn0_pty_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0_pty_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sgn0_pty_mont_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sgn0_pty_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h +$L$SEH_info_mul_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_n_mul_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_n_mul_mont_384_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_sqr_n_mul_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_n_mul_mont_383_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_n_mul_mont_383_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_sqr_n_mul_mont_383_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_mont_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_mont_382x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_sqr_mont_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/mulx_mont_256-x86_64.asm b/crypto/blst_src/build/win64/mulx_mont_256-x86_64.asm new file mode 100644 index 00000000000..83534c629e9 --- /dev/null +++ b/crypto/blst_src/build/win64/mulx_mont_256-x86_64.asm @@ -0,0 +1,796 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC mulx_mont_sparse_256 + + +ALIGN 32 +mulx_mont_sparse_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mulx_mont_sparse_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_mulx_mont_sparse_256:: + + + mov rbx,rdx + mov rdx,QWORD PTR[rdx] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rbp,QWORD 
PTR[16+rsi] + mov r9,QWORD PTR[24+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r11,rax,r14 + call __mulx_mont_sparse_256 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mulx_mont_sparse_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_mont_sparse_256:: +mulx_mont_sparse_256 ENDP + +PUBLIC sqrx_mont_sparse_256 + + +ALIGN 32 +sqrx_mont_sparse_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_mont_sparse_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sqrx_mont_sparse_256:: + + + mov rbx,rsi + mov r8,rcx + mov rcx,rdx + mov rdx,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rbp,QWORD PTR[16+rsi] + mov r9,QWORD PTR[24+rsi] + lea rsi,QWORD PTR[((-128))+rbx] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r11,rax,rdx + call __mulx_mont_sparse_256 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqrx_mont_sparse_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_mont_sparse_256:: +sqrx_mont_sparse_256 ENDP + +ALIGN 32 +__mulx_mont_sparse_256 PROC PRIVATE + DB 243,15,30,250 + mulx r12,r15,r15 + mulx r13,rbp,rbp + add r11,r15 + mulx r14,r9,r9 + mov rdx,QWORD PTR[8+rbx] + adc r12,rbp + adc r13,r9 + adc r14,0 + + mov r10,rax + imul rax,r8 + + + xor r15,r15 + mulx r9,rbp,QWORD PTR[((0+128))+rsi] + adox r11,rbp + adcx r12,r9 + + mulx r9,rbp,QWORD PTR[((8+128))+rsi] + adox r12,rbp + adcx r13,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rsi] + adox r13,rbp + adcx r14,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rsi] + mov rdx,rax + adox r14,rbp + adcx r9,r15 + adox r15,r9 + + + mulx rax,rbp,QWORD PTR[((0+128))+rcx] + adcx r10,rbp + adox rax,r11 + + mulx r9,rbp,QWORD PTR[((8+128))+rcx] + adcx rax,rbp + adox r12,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rcx] + adcx r12,rbp + adox r13,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rcx] + mov rdx,QWORD PTR[16+rbx] + adcx r13,rbp + adox r14,r9 + adcx r14,r10 + adox r15,r10 + adcx r15,r10 + adox r10,r10 + adc r10,0 + mov r11,rax + imul rax,r8 + + + xor rbp,rbp + mulx r9,rbp,QWORD PTR[((0+128))+rsi] + adox r12,rbp + adcx r13,r9 + + mulx r9,rbp,QWORD PTR[((8+128))+rsi] + adox r13,rbp + adcx r14,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rsi] + adox r14,rbp + adcx r15,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rsi] + mov rdx,rax + adox r15,rbp + adcx r9,r10 + adox r10,r9 + + + mulx rax,rbp,QWORD PTR[((0+128))+rcx] + adcx r11,rbp + adox rax,r12 + + mulx r9,rbp,QWORD PTR[((8+128))+rcx] + adcx rax,rbp + adox r13,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rcx] + adcx r13,rbp + adox r14,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rcx] + mov rdx,QWORD PTR[24+rbx] + adcx r14,rbp + adox r15,r9 + adcx r15,r11 + adox r10,r11 + adcx r10,r11 + adox r11,r11 + adc r11,0 + mov r12,rax + imul rax,r8 + + + xor rbp,rbp + mulx r9,rbp,QWORD PTR[((0+128))+rsi] + adox r13,rbp + adcx r14,r9 + + mulx r9,rbp,QWORD PTR[((8+128))+rsi] + adox r14,rbp + adcx r15,r9 + + mulx r9,rbp,QWORD 
PTR[((16+128))+rsi] + adox r15,rbp + adcx r10,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rsi] + mov rdx,rax + adox r10,rbp + adcx r9,r11 + adox r11,r9 + + + mulx rax,rbp,QWORD PTR[((0+128))+rcx] + adcx r12,rbp + adox rax,r13 + + mulx r9,rbp,QWORD PTR[((8+128))+rcx] + adcx rax,rbp + adox r14,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rcx] + adcx r14,rbp + adox r15,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rcx] + mov rdx,rax + adcx r15,rbp + adox r10,r9 + adcx r10,r12 + adox r11,r12 + adcx r11,r12 + adox r12,r12 + adc r12,0 + imul rdx,r8 + + + xor rbp,rbp + mulx r9,r13,QWORD PTR[((0+128))+rcx] + adcx r13,rax + adox r14,r9 + + mulx r9,rbp,QWORD PTR[((8+128))+rcx] + adcx r14,rbp + adox r15,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rcx] + adcx r15,rbp + adox r10,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rcx] + mov rdx,r14 + lea rcx,QWORD PTR[128+rcx] + adcx r10,rbp + adox r11,r9 + mov rax,r15 + adcx r11,r13 + adox r12,r13 + adc r12,0 + + + + + mov rbp,r10 + sub r14,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rcx] + mov r9,r11 + sbb r11,QWORD PTR[24+rcx] + sbb r12,0 + + cmovc r14,rdx + cmovc r15,rax + cmovc r10,rbp + mov QWORD PTR[rdi],r14 + cmovc r11,r9 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + DB 0F3h,0C3h ;repret +__mulx_mont_sparse_256 ENDP +PUBLIC fromx_mont_256 + + +ALIGN 32 +fromx_mont_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_fromx_mont_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_fromx_mont_256:: + + + mov rbx,rdx + call __mulx_by_1_mont_256 + + + + + + mov rdx,r15 + mov r12,r10 + mov r13,r11 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r10,QWORD PTR[16+rbx] + sbb r11,QWORD PTR[24+rbx] + + cmovnc rax,r14 + cmovnc rdx,r15 + cmovnc r12,r10 + mov QWORD PTR[rdi],rax + cmovnc r13,r11 + mov QWORD PTR[8+rdi],rdx + mov QWORD PTR[16+rdi],r12 + mov QWORD PTR[24+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_fromx_mont_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_fromx_mont_256:: +fromx_mont_256 ENDP + +PUBLIC redcx_mont_256 + + +ALIGN 32 +redcx_mont_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_redcx_mont_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_redcx_mont_256:: + + + mov rbx,rdx + call __mulx_by_1_mont_256 + + add r14,QWORD PTR[32+rsi] + adc r15,QWORD PTR[40+rsi] + mov rax,r14 + adc r10,QWORD PTR[48+rsi] + mov rdx,r15 + adc r11,QWORD PTR[56+rsi] + sbb rsi,rsi + + + + + mov r12,r10 + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r10,QWORD PTR[16+rbx] + mov r13,r11 + sbb r11,QWORD PTR[24+rbx] + sbb rsi,0 + + cmovnc rax,r14 + cmovnc rdx,r15 + cmovnc r12,r10 + mov QWORD PTR[rdi],rax + cmovnc r13,r11 + mov QWORD PTR[8+rdi],rdx + mov QWORD PTR[16+rdi],r12 + mov QWORD PTR[24+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov 
rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_redcx_mont_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_redcx_mont_256:: +redcx_mont_256 ENDP + +ALIGN 32 +__mulx_by_1_mont_256 PROC PRIVATE + DB 243,15,30,250 + mov rax,QWORD PTR[rsi] + mov r11,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov r13,QWORD PTR[24+rsi] + + mov r14,rax + imul rax,rcx + mov r10,rax + + mul QWORD PTR[rbx] + add r14,rax + mov rax,r10 + adc r14,rdx + + mul QWORD PTR[8+rbx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[16+rbx] + mov r15,r11 + imul r11,rcx + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[24+rbx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[rbx] + add r15,rax + mov rax,r11 + adc r15,rdx + + mul QWORD PTR[8+rbx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rbx] + mov r10,r12 + imul r12,rcx + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[24+rbx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[rbx] + add r10,rax + mov rax,r12 + adc r10,rdx + + mul QWORD PTR[8+rbx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rbx] + mov r11,r13 + imul r13,rcx + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[24+rbx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[rbx] + add r11,rax + mov rax,r13 + adc r11,rdx + + mul QWORD PTR[8+rbx] + add r14,rax + mov rax,r13 + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rbx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[24+rbx] + add r10,rax + mov rax,r14 + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + DB 0F3h,0C3h ;repret +__mulx_by_1_mont_256 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_mulx_mont_sparse_256 + DD imagerel $L$SEH_body_mulx_mont_sparse_256 + DD imagerel $L$SEH_info_mulx_mont_sparse_256_prologue + + DD imagerel $L$SEH_body_mulx_mont_sparse_256 + DD imagerel $L$SEH_epilogue_mulx_mont_sparse_256 + DD imagerel $L$SEH_info_mulx_mont_sparse_256_body + + DD imagerel $L$SEH_epilogue_mulx_mont_sparse_256 + DD imagerel $L$SEH_end_mulx_mont_sparse_256 + DD imagerel $L$SEH_info_mulx_mont_sparse_256_epilogue + + DD imagerel $L$SEH_begin_sqrx_mont_sparse_256 + DD imagerel $L$SEH_body_sqrx_mont_sparse_256 + DD imagerel $L$SEH_info_sqrx_mont_sparse_256_prologue + + DD imagerel $L$SEH_body_sqrx_mont_sparse_256 + DD imagerel $L$SEH_epilogue_sqrx_mont_sparse_256 + DD imagerel $L$SEH_info_sqrx_mont_sparse_256_body + + DD imagerel $L$SEH_epilogue_sqrx_mont_sparse_256 + DD imagerel $L$SEH_end_sqrx_mont_sparse_256 + DD imagerel $L$SEH_info_sqrx_mont_sparse_256_epilogue + + DD imagerel $L$SEH_begin_fromx_mont_256 + DD imagerel $L$SEH_body_fromx_mont_256 + DD imagerel $L$SEH_info_fromx_mont_256_prologue + + DD imagerel $L$SEH_body_fromx_mont_256 + DD imagerel $L$SEH_epilogue_fromx_mont_256 + DD imagerel $L$SEH_info_fromx_mont_256_body + + DD imagerel $L$SEH_epilogue_fromx_mont_256 + DD imagerel $L$SEH_end_fromx_mont_256 + DD imagerel $L$SEH_info_fromx_mont_256_epilogue + + DD imagerel $L$SEH_begin_redcx_mont_256 + DD 
imagerel $L$SEH_body_redcx_mont_256 + DD imagerel $L$SEH_info_redcx_mont_256_prologue + + DD imagerel $L$SEH_body_redcx_mont_256 + DD imagerel $L$SEH_epilogue_redcx_mont_256 + DD imagerel $L$SEH_info_redcx_mont_256_body + + DD imagerel $L$SEH_epilogue_redcx_mont_256 + DD imagerel $L$SEH_end_redcx_mont_256 + DD imagerel $L$SEH_info_redcx_mont_256_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_mulx_mont_sparse_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mulx_mont_sparse_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mulx_mont_sparse_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_mont_sparse_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_mont_sparse_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sqrx_mont_sparse_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_fromx_mont_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_fromx_mont_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_fromx_mont_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_redcx_mont_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_redcx_mont_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_redcx_mont_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/mulx_mont_384-x86_64.asm b/crypto/blst_src/build/win64/mulx_mont_384-x86_64.asm new file mode 100644 index 00000000000..25bee97731b --- /dev/null +++ b/crypto/blst_src/build/win64/mulx_mont_384-x86_64.asm @@ -0,0 +1,3586 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + + + + + + + + +ALIGN 32 +__sub_mod_384x384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + + sub r8,QWORD PTR[rdx] + mov r15,QWORD PTR[56+rsi] + sbb r9,QWORD PTR[8+rdx] + mov rax,QWORD PTR[64+rsi] + sbb r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[72+rsi] + sbb r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[80+rsi] + sbb r12,QWORD PTR[32+rdx] + mov rsi,QWORD PTR[88+rsi] + sbb r13,QWORD PTR[40+rdx] + mov QWORD PTR[rdi],r8 + sbb r14,QWORD PTR[48+rdx] + mov r8,QWORD PTR[rcx] + mov QWORD PTR[8+rdi],r9 + sbb r15,QWORD PTR[56+rdx] + mov r9,QWORD PTR[8+rcx] + mov QWORD PTR[16+rdi],r10 + sbb rax,QWORD PTR[64+rdx] + 
mov r10,QWORD PTR[16+rcx] + mov QWORD PTR[24+rdi],r11 + sbb rbx,QWORD PTR[72+rdx] + mov r11,QWORD PTR[24+rcx] + mov QWORD PTR[32+rdi],r12 + sbb rbp,QWORD PTR[80+rdx] + mov r12,QWORD PTR[32+rcx] + mov QWORD PTR[40+rdi],r13 + sbb rsi,QWORD PTR[88+rdx] + mov r13,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r8,rdx + and r9,rdx + and r10,rdx + and r11,rdx + and r12,rdx + and r13,rdx + + add r14,r8 + adc r15,r9 + mov QWORD PTR[48+rdi],r14 + adc rax,r10 + mov QWORD PTR[56+rdi],r15 + adc rbx,r11 + mov QWORD PTR[64+rdi],rax + adc rbp,r12 + mov QWORD PTR[72+rdi],rbx + adc rsi,r13 + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rsi + + DB 0F3h,0C3h ;repret +__sub_mod_384x384 ENDP + + +ALIGN 32 +__add_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + adc r10,QWORD PTR[16+rdx] + mov r14,r8 + adc r11,QWORD PTR[24+rdx] + mov r15,r9 + adc r12,QWORD PTR[32+rdx] + mov rax,r10 + adc r13,QWORD PTR[40+rdx] + mov rbx,r11 + sbb rdx,rdx + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + mov QWORD PTR[rdi],r8 + cmovc r11,rbx + mov QWORD PTR[8+rdi],r9 + cmovc r12,rbp + mov QWORD PTR[16+rdi],r10 + cmovc r13,rsi + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__add_mod_384 ENDP + + +ALIGN 32 +__sub_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +__sub_mod_384_a_is_loaded:: + sub r8,QWORD PTR[rdx] + mov r14,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rdx] + mov r15,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rdx] + mov rax,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rdx] + mov rbx,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rdx] + mov rbp,QWORD PTR[32+rcx] + sbb r13,QWORD PTR[40+rdx] + mov rsi,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r14,rdx + and r15,rdx + and rax,rdx + and rbx,rdx + and rbp,rdx + and rsi,rdx + + add r8,r14 + adc r9,r15 + mov QWORD PTR[rdi],r8 + adc r10,rax + mov QWORD PTR[8+rdi],r9 + adc r11,rbx + mov QWORD PTR[16+rdi],r10 + adc r12,rbp + mov QWORD PTR[24+rdi],r11 + adc r13,rsi + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__sub_mod_384 ENDP +PUBLIC mulx_mont_384x + + +ALIGN 32 +mulx_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mulx_mont_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,328 + +$L$SEH_body_mulx_mont_384x:: + + + mov rbx,rdx + mov QWORD PTR[32+rsp],rdi + mov QWORD PTR[24+rsp],rsi + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[8+rsp],rcx + mov QWORD PTR[rsp],r8 + + + + + lea rdi,QWORD PTR[40+rsp] + call __mulx_384 + + + lea rbx,QWORD PTR[48+rbx] + lea rsi,QWORD PTR[((128+48))+rsi] + lea rdi,QWORD PTR[96+rdi] + call __mulx_384 + + + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[rbx] + lea rdx,QWORD PTR[((-48))+rbx] + lea rdi,QWORD PTR[((40+192+48))+rsp] + call __add_mod_384 + + mov rsi,QWORD PTR[24+rsp] + lea rdx,QWORD PTR[48+rsi] + lea 
rdi,QWORD PTR[((-48))+rdi] + call __add_mod_384 + + lea rbx,QWORD PTR[rdi] + lea rsi,QWORD PTR[48+rdi] + call __mulx_384 + + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[40+rsp] + mov rcx,QWORD PTR[8+rsp] + call __sub_mod_384x384 + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[((-96))+rdi] + call __sub_mod_384x384 + + + lea rsi,QWORD PTR[40+rsp] + lea rdx,QWORD PTR[((40+96))+rsp] + lea rdi,QWORD PTR[40+rsp] + call __sub_mod_384x384 + + lea rbx,QWORD PTR[rcx] + + + lea rsi,QWORD PTR[40+rsp] + mov rcx,QWORD PTR[rsp] + mov rdi,QWORD PTR[32+rsp] + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + + lea rsi,QWORD PTR[((40+192))+rsp] + mov rcx,QWORD PTR[rsp] + lea rdi,QWORD PTR[48+rdi] + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + lea r8,QWORD PTR[328+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_mulx_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_mont_384x:: +mulx_mont_384x ENDP +PUBLIC sqrx_mont_384x + + +ALIGN 32 +sqrx_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_mont_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_sqrx_mont_384x:: + + + mov QWORD PTR[rsp],rcx + mov rcx,rdx + + mov QWORD PTR[16+rsp],rdi + mov QWORD PTR[24+rsp],rsi + + + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[32+rsp] + call __add_mod_384 + + + mov rsi,QWORD PTR[24+rsp] + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[((32+48))+rsp] + call __sub_mod_384 + + + mov rsi,QWORD PTR[24+rsp] + lea rbx,QWORD PTR[48+rsi] + + mov rdx,QWORD PTR[48+rsi] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,r14 + call __mulx_mont_384 + add rdx,rdx + adc r15,r15 + adc rax,rax + mov r8,rdx + adc r12,r12 + mov r9,r15 + adc rdi,rdi + mov r10,rax + adc rbp,rbp + mov r11,r12 + sbb rsi,rsi + + sub rdx,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + mov r13,rdi + sbb rax,QWORD PTR[16+rcx] + sbb r12,QWORD PTR[24+rcx] + sbb rdi,QWORD PTR[32+rcx] + mov r14,rbp + sbb rbp,QWORD PTR[40+rcx] + sbb rsi,0 + + cmovc rdx,r8 + cmovc r15,r9 + cmovc rax,r10 + mov QWORD PTR[48+rbx],rdx + cmovc r12,r11 + mov QWORD PTR[56+rbx],r15 + cmovc rdi,r13 + mov QWORD PTR[64+rbx],rax + cmovc rbp,r14 + mov QWORD PTR[72+rbx],r12 + mov QWORD PTR[80+rbx],rdi + mov QWORD PTR[88+rbx],rbp + + lea rsi,QWORD PTR[32+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + + mov rdx,QWORD PTR[((32+48))+rsp] + mov r14,QWORD PTR[((32+0))+rsp] + mov r15,QWORD PTR[((32+8))+rsp] + mov rax,QWORD PTR[((32+16))+rsp] + mov r12,QWORD PTR[((32+24))+rsp] + mov rdi,QWORD PTR[((32+32))+rsp] + mov rbp,QWORD PTR[((32+40))+rsp] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,r14 + call __mulx_mont_384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqrx_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 
epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_mont_384x:: +sqrx_mont_384x ENDP + +PUBLIC mulx_382x + + +ALIGN 32 +mulx_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mulx_382x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_mulx_382x:: + + + lea rdi,QWORD PTR[96+rdi] + mov QWORD PTR[rsp],rsi + mov QWORD PTR[8+rsp],rdx + mov QWORD PTR[16+rsp],rdi + mov QWORD PTR[24+rsp],rcx + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + add r8,QWORD PTR[48+rsi] + adc r9,QWORD PTR[56+rsi] + adc r10,QWORD PTR[64+rsi] + adc r11,QWORD PTR[72+rsi] + adc r12,QWORD PTR[80+rsi] + adc r13,QWORD PTR[88+rsi] + + mov QWORD PTR[((32+0))+rsp],r8 + mov QWORD PTR[((32+8))+rsp],r9 + mov QWORD PTR[((32+16))+rsp],r10 + mov QWORD PTR[((32+24))+rsp],r11 + mov QWORD PTR[((32+32))+rsp],r12 + mov QWORD PTR[((32+40))+rsp],r13 + + + mov r8,QWORD PTR[rdx] + mov r9,QWORD PTR[8+rdx] + mov r10,QWORD PTR[16+rdx] + mov r11,QWORD PTR[24+rdx] + mov r12,QWORD PTR[32+rdx] + mov r13,QWORD PTR[40+rdx] + + add r8,QWORD PTR[48+rdx] + adc r9,QWORD PTR[56+rdx] + adc r10,QWORD PTR[64+rdx] + adc r11,QWORD PTR[72+rdx] + adc r12,QWORD PTR[80+rdx] + adc r13,QWORD PTR[88+rdx] + + mov QWORD PTR[((32+48))+rsp],r8 + mov QWORD PTR[((32+56))+rsp],r9 + mov QWORD PTR[((32+64))+rsp],r10 + mov QWORD PTR[((32+72))+rsp],r11 + mov QWORD PTR[((32+80))+rsp],r12 + mov QWORD PTR[((32+88))+rsp],r13 + + + lea rsi,QWORD PTR[((32+0))+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + call __mulx_384 + + + mov rsi,QWORD PTR[rsp] + mov rbx,QWORD PTR[8+rsp] + lea rdi,QWORD PTR[((-96))+rdi] + call __mulx_384 + + + lea rsi,QWORD PTR[((48+128))+rsi] + lea rbx,QWORD PTR[48+rbx] + lea rdi,QWORD PTR[32+rsp] + call __mulx_384 + + + mov rsi,QWORD PTR[16+rsp] + lea rdx,QWORD PTR[32+rsp] + mov rcx,QWORD PTR[24+rsp] + mov rdi,rsi + call __sub_mod_384x384 + + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[((-96))+rdi] + call __sub_mod_384x384 + + + lea rsi,QWORD PTR[((-96))+rdi] + lea rdx,QWORD PTR[32+rsp] + lea rdi,QWORD PTR[((-96))+rdi] + call __sub_mod_384x384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_mulx_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_382x:: +mulx_382x ENDP +PUBLIC sqrx_382x + + +ALIGN 32 +sqrx_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_382x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_sqrx_382x:: + + + mov rcx,rdx + + + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov rbx,QWORD PTR[24+rsi] + mov rbp,QWORD PTR[32+rsi] + mov rdx,QWORD PTR[40+rsi] + + mov r8,r14 + add r14,QWORD PTR[48+rsi] + mov r9,r15 + adc r15,QWORD PTR[56+rsi] + mov r10,rax + adc rax,QWORD PTR[64+rsi] + mov r11,rbx + adc rbx,QWORD PTR[72+rsi] + mov r12,rbp + adc rbp,QWORD PTR[80+rsi] + mov r13,rdx + adc rdx,QWORD PTR[88+rsi] + + mov QWORD 
PTR[rdi],r14 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],rax + mov QWORD PTR[24+rdi],rbx + mov QWORD PTR[32+rdi],rbp + mov QWORD PTR[40+rdi],rdx + + + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[48+rdi] + call __sub_mod_384_a_is_loaded + + + lea rsi,QWORD PTR[rdi] + lea rbx,QWORD PTR[((-48))+rdi] + lea rdi,QWORD PTR[((-48))+rdi] + call __mulx_384 + + + mov rsi,QWORD PTR[rsp] + lea rbx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[96+rdi] + call __mulx_384 + + mov r8,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + mov r10,QWORD PTR[16+rdi] + mov r11,QWORD PTR[24+rdi] + mov r12,QWORD PTR[32+rdi] + mov r13,QWORD PTR[40+rdi] + mov r14,QWORD PTR[48+rdi] + mov r15,QWORD PTR[56+rdi] + mov rax,QWORD PTR[64+rdi] + mov rbx,QWORD PTR[72+rdi] + mov rbp,QWORD PTR[80+rdi] + add r8,r8 + mov rdx,QWORD PTR[88+rdi] + adc r9,r9 + mov QWORD PTR[rdi],r8 + adc r10,r10 + mov QWORD PTR[8+rdi],r9 + adc r11,r11 + mov QWORD PTR[16+rdi],r10 + adc r12,r12 + mov QWORD PTR[24+rdi],r11 + adc r13,r13 + mov QWORD PTR[32+rdi],r12 + adc r14,r14 + mov QWORD PTR[40+rdi],r13 + adc r15,r15 + mov QWORD PTR[48+rdi],r14 + adc rax,rax + mov QWORD PTR[56+rdi],r15 + adc rbx,rbx + mov QWORD PTR[64+rdi],rax + adc rbp,rbp + mov QWORD PTR[72+rdi],rbx + adc rdx,rdx + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rdx + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqrx_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_382x:: +sqrx_382x ENDP +PUBLIC mulx_384 + + +ALIGN 32 +mulx_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mulx_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$SEH_body_mulx_384:: + + + mov rbx,rdx + call __mulx_384 + + mov r15,QWORD PTR[rsp] + + mov r14,QWORD PTR[8+rsp] + + mov r13,QWORD PTR[16+rsp] + + mov r12,QWORD PTR[24+rsp] + + mov rbx,QWORD PTR[32+rsp] + + mov rbp,QWORD PTR[40+rsp] + + lea rsp,QWORD PTR[48+rsp] + +$L$SEH_epilogue_mulx_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_384:: +mulx_384 ENDP + + +ALIGN 32 +__mulx_384 PROC PRIVATE + DB 243,15,30,250 + mov rdx,QWORD PTR[rbx] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + + mulx rcx,r9,r14 + xor rbp,rbp + + mulx rax,r8,r15 + adcx r8,rcx + mov QWORD PTR[rdi],r9 + + mulx rcx,r9,r10 + adcx r9,rax + + mulx rax,r10,r11 + adcx r10,rcx + + mulx rcx,r11,r12 + adcx r11,rax + + mulx r13,r12,r13 + mov rdx,QWORD PTR[8+rbx] + adcx r12,rcx + adcx r13,rbp + mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[8+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,QWORD PTR[16+rbx] + adcx r12,rax + adox r13,rbp + adcx r13,rbp + mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[16+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx 
rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,QWORD PTR[24+rbx] + adcx r12,rax + adox r13,rbp + adcx r13,rbp + mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[24+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,QWORD PTR[32+rbx] + adcx r12,rax + adox r13,rbp + adcx r13,rbp + mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[32+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,QWORD PTR[40+rbx] + adcx r12,rax + adox r13,rbp + adcx r13,rbp + mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[40+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,rax + adcx r12,rax + adox r13,rbp + adcx r13,rbp + mov QWORD PTR[48+rdi],r8 + mov QWORD PTR[56+rdi],r9 + mov QWORD PTR[64+rdi],r10 + mov QWORD PTR[72+rdi],r11 + mov QWORD PTR[80+rdi],r12 + mov QWORD PTR[88+rdi],r13 + + DB 0F3h,0C3h ;repret +__mulx_384 ENDP +PUBLIC sqrx_384 + + +ALIGN 32 +sqrx_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_384:: + mov rdi,rcx + mov rsi,rdx + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_sqrx_384:: + + + call __sqrx_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqrx_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_384:: +sqrx_384 ENDP + +ALIGN 32 +__sqrx_384 PROC PRIVATE + DB 243,15,30,250 + mov rdx,QWORD PTR[rsi] + mov r14,QWORD PTR[8+rsi] + mov r15,QWORD PTR[16+rsi] + mov rcx,QWORD PTR[24+rsi] + mov rbx,QWORD PTR[32+rsi] + + + mulx rdi,r8,r14 + mov rbp,QWORD PTR[40+rsi] + mulx rax,r9,r15 + add r9,rdi + mulx rdi,r10,rcx + adc r10,rax + mulx rax,r11,rbx + adc r11,rdi + mulx r13,r12,rbp + mov rdx,r14 + adc r12,rax + adc r13,0 + + + xor r14,r14 + mulx rax,rdi,r15 + adcx r10,rdi + adox r11,rax + + mulx rax,rdi,rcx + adcx r11,rdi + adox r12,rax + + mulx rax,rdi,rbx + adcx r12,rdi + adox r13,rax + + mulx rax,rdi,rbp + mov rdx,r15 + adcx r13,rdi + adox rax,r14 + adcx r14,rax + + + xor r15,r15 + mulx rax,rdi,rcx + adcx r12,rdi + adox r13,rax + + mulx rax,rdi,rbx + adcx r13,rdi + adox r14,rax + + mulx rax,rdi,rbp + mov rdx,rcx + adcx r14,rdi + adox rax,r15 + adcx r15,rax + + + xor rcx,rcx + mulx rax,rdi,rbx + adcx r14,rdi + adox r15,rax + + mulx rax,rdi,rbp + mov 
rdx,rbx + adcx r15,rdi + adox rax,rcx + adcx rcx,rax + + + mulx rbx,rdi,rbp + mov rdx,QWORD PTR[rsi] + add rcx,rdi + mov rdi,QWORD PTR[8+rsp] + adc rbx,0 + + + xor rbp,rbp + adcx r8,r8 + adcx r9,r9 + adcx r10,r10 + adcx r11,r11 + adcx r12,r12 + + + mulx rax,rdx,rdx + mov QWORD PTR[rdi],rdx + mov rdx,QWORD PTR[8+rsi] + adox r8,rax + mov QWORD PTR[8+rdi],r8 + + mulx rax,r8,rdx + mov rdx,QWORD PTR[16+rsi] + adox r9,r8 + adox r10,rax + mov QWORD PTR[16+rdi],r9 + mov QWORD PTR[24+rdi],r10 + + mulx r9,r8,rdx + mov rdx,QWORD PTR[24+rsi] + adox r11,r8 + adox r12,r9 + adcx r13,r13 + adcx r14,r14 + mov QWORD PTR[32+rdi],r11 + mov QWORD PTR[40+rdi],r12 + + mulx r9,r8,rdx + mov rdx,QWORD PTR[32+rsi] + adox r13,r8 + adox r14,r9 + adcx r15,r15 + adcx rcx,rcx + mov QWORD PTR[48+rdi],r13 + mov QWORD PTR[56+rdi],r14 + + mulx r9,r8,rdx + mov rdx,QWORD PTR[40+rsi] + adox r15,r8 + adox rcx,r9 + adcx rbx,rbx + adcx rbp,rbp + mov QWORD PTR[64+rdi],r15 + mov QWORD PTR[72+rdi],rcx + + mulx r9,r8,rdx + adox rbx,r8 + adox rbp,r9 + + mov QWORD PTR[80+rdi],rbx + mov QWORD PTR[88+rdi],rbp + + DB 0F3h,0C3h ;repret +__sqrx_384 ENDP + + + +PUBLIC redcx_mont_384 + + +ALIGN 32 +redcx_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_redcx_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_redcx_mont_384:: + + + mov rbx,rdx + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_redcx_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_redcx_mont_384:: +redcx_mont_384 ENDP + + + + +PUBLIC fromx_mont_384 + + +ALIGN 32 +fromx_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_fromx_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_fromx_mont_384:: + + + mov rbx,rdx + call __mulx_by_1_mont_384 + + + + + mov rax,r14 + mov rcx,r15 + mov rdx,r8 + mov rbp,r9 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + mov r13,r10 + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rbx] + + cmovc r14,rax + cmovc r15,rcx + cmovc r8,rdx + mov QWORD PTR[rdi],r14 + cmovc r9,rbp + mov QWORD PTR[8+rdi],r15 + cmovc r10,r13 + mov QWORD PTR[16+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_fromx_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_fromx_mont_384:: +fromx_mont_384 ENDP + +ALIGN 32 +__mulx_by_1_mont_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov rdx,rcx + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD 
PTR[40+rsi] + imul rdx,r8 + + + xor r14,r14 + mulx rbp,rax,QWORD PTR[rbx] + adcx r8,rax + adox r9,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r9,rax + adox r10,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r10,rax + adox r11,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r11,rax + adox r12,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r13,rax + adox rbp,r14 + adcx r14,rbp + imul rdx,r9 + + + xor r15,r15 + mulx rbp,rax,QWORD PTR[rbx] + adcx r9,rax + adox r10,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r10,rax + adox r11,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r11,rax + adox r12,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r14,rax + adox rbp,r15 + adcx r15,rbp + imul rdx,r10 + + + xor r8,r8 + mulx rbp,rax,QWORD PTR[rbx] + adcx r10,rax + adox r11,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r11,rax + adox r12,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r14,rax + adox r15,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r15,rax + adox rbp,r8 + adcx r8,rbp + imul rdx,r11 + + + xor r9,r9 + mulx rbp,rax,QWORD PTR[rbx] + adcx r11,rax + adox r12,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r14,rax + adox r15,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r15,rax + adox r8,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r8,rax + adox rbp,r9 + adcx r9,rbp + imul rdx,r12 + + + xor r10,r10 + mulx rbp,rax,QWORD PTR[rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r14,rax + adox r15,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r15,rax + adox r8,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r8,rax + adox r9,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r9,rax + adox rbp,r10 + adcx r10,rbp + imul rdx,r13 + + + xor r11,r11 + mulx rbp,rax,QWORD PTR[rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r14,rax + adox r15,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r15,rax + adox r8,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r8,rax + adox r9,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r9,rax + adox r10,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r10,rax + adox rbp,r11 + adcx r11,rbp + DB 0F3h,0C3h ;repret +__mulx_by_1_mont_384 ENDP + + +ALIGN 32 +__redc_tail_mont_384 PROC PRIVATE + DB 243,15,30,250 + add r14,QWORD PTR[48+rsi] + mov rax,r14 + adc r15,QWORD PTR[56+rsi] + adc r8,QWORD PTR[64+rsi] + adc r9,QWORD PTR[72+rsi] + mov rcx,r15 + adc r10,QWORD PTR[80+rsi] + adc r11,QWORD PTR[88+rsi] + sbb r12,r12 + + + + + mov rdx,r8 + mov rbp,r9 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + mov r13,r10 + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rbx] + sbb r12,0 + + cmovc r14,rax + cmovc r15,rcx + cmovc r8,rdx + mov QWORD PTR[rdi],r14 + cmovc r9,rbp + mov QWORD PTR[8+rdi],r15 + cmovc r10,r13 + mov QWORD PTR[16+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + DB 0F3h,0C3h ;repret +__redc_tail_mont_384 ENDP + +PUBLIC 
sgn0x_pty_mont_384 + + +ALIGN 32 +sgn0x_pty_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0x_pty_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sgn0x_pty_mont_384:: + + + mov rbx,rsi + lea rsi,QWORD PTR[rdi] + mov rcx,rdx + call __mulx_by_1_mont_384 + + xor rax,rax + mov r13,r14 + add r14,r14 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rax,0 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rax,0 + + not rax + and r13,1 + and rax,2 + or rax,r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sgn0x_pty_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0x_pty_mont_384:: +sgn0x_pty_mont_384 ENDP + +PUBLIC sgn0x_pty_mont_384x + + +ALIGN 32 +sgn0x_pty_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0x_pty_mont_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sgn0x_pty_mont_384x:: + + + mov rbx,rsi + lea rsi,QWORD PTR[48+rdi] + mov rcx,rdx + call __mulx_by_1_mont_384 + + mov r12,r14 + or r14,r15 + or r14,r8 + or r14,r9 + or r14,r10 + or r14,r11 + + lea rsi,QWORD PTR[rdi] + xor rdi,rdi + mov r13,r12 + add r12,r12 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rdi,0 + + sub r12,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rdi,0 + + mov QWORD PTR[rsp],r14 + not rdi + and r13,1 + and rdi,2 + or rdi,r13 + + call __mulx_by_1_mont_384 + + mov r12,r14 + or r14,r15 + or r14,r8 + or r14,r9 + or r14,r10 + or r14,r11 + + xor rax,rax + mov r13,r12 + add r12,r12 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rax,0 + + sub r12,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rax,0 + + mov r12,QWORD PTR[rsp] + + not rax + + test r14,r14 + cmovz r13,rdi + + test r12,r12 + cmovnz rax,rdi + + and r13,1 + and rax,2 + or rax,r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sgn0x_pty_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0x_pty_mont_384x:: +sgn0x_pty_mont_384x ENDP +PUBLIC mulx_mont_384 + + +ALIGN 32 +mulx_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mulx_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,QWORD PTR[((-24))+rsp] + +$L$SEH_body_mulx_mont_384:: 
+ + + mov rbx,rdx + mov rdx,QWORD PTR[rdx] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] + mov QWORD PTR[16+rsp],rdi + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD PTR[((-128))+rcx] + mov QWORD PTR[rsp],r8 + + mulx r9,r8,r14 + call __mulx_mont_384 + + mov r15,QWORD PTR[24+rsp] + + mov r14,QWORD PTR[32+rsp] + + mov r13,QWORD PTR[40+rsp] + + mov r12,QWORD PTR[48+rsp] + + mov rbx,QWORD PTR[56+rsp] + + mov rbp,QWORD PTR[64+rsp] + + lea rsp,QWORD PTR[72+rsp] + +$L$SEH_epilogue_mulx_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_mont_384:: +mulx_mont_384 ENDP + +ALIGN 32 +__mulx_mont_384 PROC PRIVATE + DB 243,15,30,250 + + mulx r10,r14,r15 + mulx r11,r15,rax + add r9,r14 + mulx r12,rax,r12 + adc r10,r15 + mulx r13,rdi,rdi + adc r11,rax + mulx r14,rbp,rbp + mov rdx,QWORD PTR[8+rbx] + adc r12,rdi + adc r13,rbp + adc r14,0 + xor r15,r15 + + mov QWORD PTR[16+rsp],r8 + imul r8,QWORD PTR[8+rsp] + + + xor rax,rax + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r9,rdi + adcx r10,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r10,rdi + adcx r11,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r8 + adox r14,rdi + adcx r15,rbp + adox r15,rax + adox rax,rax + + + xor r8,r8 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r9,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r9,rdi + adox r10,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r10,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[16+rbx] + adcx r13,rdi + adox r14,rbp + adcx r14,r8 + adox r15,r8 + adcx r15,r8 + adox rax,r8 + adcx rax,r8 + mov QWORD PTR[16+rsp],r9 + imul r9,QWORD PTR[8+rsp] + + + xor r8,r8 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r10,rdi + adcx r11,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r9 + adox r15,rdi + adcx rax,rbp + adox rax,r8 + adox r8,r8 + + + xor r9,r9 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r10,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r10,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[24+rbx] + adcx r14,rdi + adox r15,rbp + adcx r15,r9 + adox rax,r9 + adcx rax,r9 + adox r8,r9 + adcx r8,r9 + mov QWORD PTR[16+rsp],r10 + imul r10,QWORD PTR[8+rsp] + + + xor r9,r9 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx 
rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r10 + adox rax,rdi + adcx r8,rbp + adox r8,r9 + adox r9,r9 + + + xor r10,r10 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[32+rbx] + adcx r15,rdi + adox rax,rbp + adcx rax,r10 + adox r8,r10 + adcx r8,r10 + adox r9,r10 + adcx r9,r10 + mov QWORD PTR[16+rsp],r11 + imul r11,QWORD PTR[8+rsp] + + + xor r10,r10 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox rax,rdi + adcx r8,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r11 + adox r8,rdi + adcx r9,rbp + adox r9,r10 + adox r10,r10 + + + xor r11,r11 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[40+rbx] + adcx rax,rdi + adox r8,rbp + adcx r8,r11 + adox r9,r11 + adcx r9,r11 + adox r10,r11 + adcx r10,r11 + mov QWORD PTR[16+rsp],r12 + imul r12,QWORD PTR[8+rsp] + + + xor r11,r11 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox rax,rdi + adcx r8,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r8,rdi + adcx r9,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r12 + adox r9,rdi + adcx r10,rbp + adox r10,r11 + adox r11,r11 + + + xor r12,r12 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx rax,rdi + adox r8,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,r13 + adcx r8,rdi + adox r9,rbp + adcx r9,r12 + adox r10,r12 + adcx r10,r12 + adox r11,r12 + adcx r11,r12 + imul rdx,QWORD PTR[8+rsp] + mov rbx,QWORD PTR[24+rsp] + + + xor r12,r12 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx rax,rdi + adox r8,rbp + mov r13,r15 + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r8,rdi + adox r9,rbp + mov rsi,rax + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + adcx r9,rdi + adox 
r10,rbp + mov rdx,r14 + adcx r10,r12 + adox r11,r12 + lea rcx,QWORD PTR[128+rcx] + mov r12,r8 + adc r11,0 + + + + + sub r14,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + mov rdi,r9 + sbb rax,QWORD PTR[16+rcx] + sbb r8,QWORD PTR[24+rcx] + sbb r9,QWORD PTR[32+rcx] + mov rbp,r10 + sbb r10,QWORD PTR[40+rcx] + sbb r11,0 + + cmovnc rdx,r14 + cmovc r15,r13 + cmovc rax,rsi + cmovnc r12,r8 + mov QWORD PTR[rbx],rdx + cmovnc rdi,r9 + mov QWORD PTR[8+rbx],r15 + cmovnc rbp,r10 + mov QWORD PTR[16+rbx],rax + mov QWORD PTR[24+rbx],r12 + mov QWORD PTR[32+rbx],rdi + mov QWORD PTR[40+rbx],rbp + + DB 0F3h,0C3h ;repret + +__mulx_mont_384 ENDP +PUBLIC sqrx_mont_384 + + +ALIGN 32 +sqrx_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,QWORD PTR[((-24))+rsp] + +$L$SEH_body_sqrx_mont_384:: + + + mov r8,rcx + lea rcx,QWORD PTR[((-128))+rdx] + mov rdx,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] + mov QWORD PTR[16+rsp],rdi + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + + lea rbx,QWORD PTR[rsi] + mov QWORD PTR[rsp],r8 + lea rsi,QWORD PTR[((-128))+rsi] + + mulx r9,r8,rdx + call __mulx_mont_384 + + mov r15,QWORD PTR[24+rsp] + + mov r14,QWORD PTR[32+rsp] + + mov r13,QWORD PTR[40+rsp] + + mov r12,QWORD PTR[48+rsp] + + mov rbx,QWORD PTR[56+rsp] + + mov rbp,QWORD PTR[64+rsp] + + lea rsp,QWORD PTR[72+rsp] + +$L$SEH_epilogue_sqrx_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_mont_384:: +sqrx_mont_384 ENDP + +PUBLIC sqrx_n_mul_mont_384 + + +ALIGN 32 +sqrx_n_mul_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_n_mul_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + mov r9,QWORD PTR[48+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,QWORD PTR[((-40))+rsp] + +$L$SEH_body_sqrx_n_mul_mont_384:: + + + mov r10,rdx + mov rdx,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov rbx,rsi + mov r12,QWORD PTR[24+rsi] + mov QWORD PTR[16+rsp],rdi + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + + mov QWORD PTR[rsp],r8 + mov QWORD PTR[24+rsp],r9 + movq xmm2,QWORD PTR[r9] + +$L$oop_sqrx_384:: + movd xmm1,r10d + lea rsi,QWORD PTR[((-128))+rbx] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,rdx + call __mulx_mont_384 + + movd r10d,xmm1 + dec r10d + jnz $L$oop_sqrx_384 + + mov r14,rdx +DB 102,72,15,126,210 + lea rsi,QWORD PTR[((-128))+rbx] + mov rbx,QWORD PTR[24+rsp] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,r14 + call __mulx_mont_384 + + mov r15,QWORD PTR[40+rsp] + + mov r14,QWORD PTR[48+rsp] + + mov r13,QWORD PTR[56+rsp] + + mov r12,QWORD PTR[64+rsp] + + mov rbx,QWORD PTR[72+rsp] + + mov rbp,QWORD PTR[80+rsp] + + lea rsp,QWORD PTR[88+rsp] + +$L$SEH_epilogue_sqrx_n_mul_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_n_mul_mont_384:: +sqrx_n_mul_mont_384 ENDP + +PUBLIC sqrx_n_mul_mont_383 + + +ALIGN 32 +sqrx_n_mul_mont_383 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp 
+$L$SEH_begin_sqrx_n_mul_mont_383:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + mov r9,QWORD PTR[48+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,QWORD PTR[((-40))+rsp] + +$L$SEH_body_sqrx_n_mul_mont_383:: + + + mov r10,rdx + mov rdx,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov rbx,rsi + mov r12,QWORD PTR[24+rsi] + mov QWORD PTR[16+rsp],rdi + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + + mov QWORD PTR[rsp],r8 + mov QWORD PTR[24+rsp],r9 + movq xmm2,QWORD PTR[r9] + lea rcx,QWORD PTR[((-128))+rcx] + +$L$oop_sqrx_383:: + movd xmm1,r10d + lea rsi,QWORD PTR[((-128))+rbx] + + mulx r9,r8,rdx + call __mulx_mont_383_nonred + + movd r10d,xmm1 + dec r10d + jnz $L$oop_sqrx_383 + + mov r14,rdx +DB 102,72,15,126,210 + lea rsi,QWORD PTR[((-128))+rbx] + mov rbx,QWORD PTR[24+rsp] + + mulx r9,r8,r14 + call __mulx_mont_384 + + mov r15,QWORD PTR[40+rsp] + + mov r14,QWORD PTR[48+rsp] + + mov r13,QWORD PTR[56+rsp] + + mov r12,QWORD PTR[64+rsp] + + mov rbx,QWORD PTR[72+rsp] + + mov rbp,QWORD PTR[80+rsp] + + lea rsp,QWORD PTR[88+rsp] + +$L$SEH_epilogue_sqrx_n_mul_mont_383:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_n_mul_mont_383:: +sqrx_n_mul_mont_383 ENDP + +ALIGN 32 +__mulx_mont_383_nonred PROC PRIVATE + DB 243,15,30,250 + + mulx r10,r14,r15 + mulx r11,r15,rax + add r9,r14 + mulx r12,rax,r12 + adc r10,r15 + mulx r13,rdi,rdi + adc r11,rax + mulx r14,rbp,rbp + mov rdx,QWORD PTR[8+rbx] + adc r12,rdi + adc r13,rbp + adc r14,0 + mov rax,r8 + imul r8,QWORD PTR[8+rsp] + + + xor r15,r15 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r9,rdi + adcx r10,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r10,rdi + adcx r11,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r8 + adox r14,rdi + adcx rbp,r15 + adox r15,rbp + + + xor r8,r8 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rax,rdi + adox r9,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r9,rdi + adox r10,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r10,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[16+rbx] + adcx r13,rdi + adox r14,rbp + adcx r14,rax + adox r15,rax + adcx r15,rax + mov r8,r9 + imul r9,QWORD PTR[8+rsp] + + + xor rax,rax + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r10,rdi + adcx r11,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r9 + adox r15,rdi + adcx rbp,rax + adox rax,rbp + + + xor r9,r9 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r8,rdi + adox r10,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r10,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r13,rdi + 
adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[24+rbx] + adcx r14,rdi + adox r15,rbp + adcx r15,r8 + adox rax,r8 + adcx rax,r8 + mov r9,r10 + imul r10,QWORD PTR[8+rsp] + + + xor r8,r8 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r10 + adox rax,rdi + adcx rbp,r8 + adox r8,rbp + + + xor r10,r10 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r9,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[32+rbx] + adcx r15,rdi + adox rax,rbp + adcx rax,r9 + adox r8,r9 + adcx r8,r9 + mov r10,r11 + imul r11,QWORD PTR[8+rsp] + + + xor r9,r9 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox rax,rdi + adcx r8,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r11 + adox r8,rdi + adcx rbp,r9 + adox r9,rbp + + + xor r11,r11 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r10,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[40+rbx] + adcx rax,rdi + adox r8,rbp + adcx r8,r10 + adox r9,r10 + adcx r9,r10 + mov r11,r12 + imul r12,QWORD PTR[8+rsp] + + + xor r10,r10 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox rax,rdi + adcx r8,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r8,rdi + adcx r9,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r12 + adox r9,rdi + adcx rbp,r10 + adox r10,rbp + + + xor r12,r12 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r11,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx rax,rdi + adox r8,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,r13 + adcx r8,rdi + adox r9,rbp + adcx r9,r11 + adox r10,r11 + adcx r10,r11 + imul rdx,QWORD PTR[8+rsp] + mov rbx,QWORD PTR[24+rsp] + + + xor r12,r12 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx 
rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx rax,rdi + adox r8,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r8,rdi + adox r9,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,r14 + adcx r9,rdi + adox r10,rbp + adc r10,0 + mov r12,r8 + + mov QWORD PTR[rbx],r14 + mov QWORD PTR[8+rbx],r15 + mov QWORD PTR[16+rbx],rax + mov rdi,r9 + mov QWORD PTR[24+rbx],r8 + mov QWORD PTR[32+rbx],r9 + mov QWORD PTR[40+rbx],r10 + mov rbp,r10 + + DB 0F3h,0C3h ;repret + +__mulx_mont_383_nonred ENDP +PUBLIC sqrx_mont_382x + + +ALIGN 32 +sqrx_mont_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_mont_382x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_sqrx_mont_382x:: + + + mov QWORD PTR[rsp],rcx + mov rcx,rdx + mov QWORD PTR[16+rsp],rdi + mov QWORD PTR[24+rsp],rsi + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,r8 + add r8,QWORD PTR[48+rsi] + mov r15,r9 + adc r9,QWORD PTR[56+rsi] + mov rax,r10 + adc r10,QWORD PTR[64+rsi] + mov rdx,r11 + adc r11,QWORD PTR[72+rsi] + mov rbx,r12 + adc r12,QWORD PTR[80+rsi] + mov rbp,r13 + adc r13,QWORD PTR[88+rsi] + + sub r14,QWORD PTR[48+rsi] + sbb r15,QWORD PTR[56+rsi] + sbb rax,QWORD PTR[64+rsi] + sbb rdx,QWORD PTR[72+rsi] + sbb rbx,QWORD PTR[80+rsi] + sbb rbp,QWORD PTR[88+rsi] + sbb rdi,rdi + + mov QWORD PTR[((32+0))+rsp],r8 + mov QWORD PTR[((32+8))+rsp],r9 + mov QWORD PTR[((32+16))+rsp],r10 + mov QWORD PTR[((32+24))+rsp],r11 + mov QWORD PTR[((32+32))+rsp],r12 + mov QWORD PTR[((32+40))+rsp],r13 + + mov QWORD PTR[((32+48))+rsp],r14 + mov QWORD PTR[((32+56))+rsp],r15 + mov QWORD PTR[((32+64))+rsp],rax + mov QWORD PTR[((32+72))+rsp],rdx + mov QWORD PTR[((32+80))+rsp],rbx + mov QWORD PTR[((32+88))+rsp],rbp + mov QWORD PTR[((32+96))+rsp],rdi + + + + lea rbx,QWORD PTR[48+rsi] + + mov rdx,QWORD PTR[48+rsi] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,r14 + call __mulx_mont_383_nonred + add rdx,rdx + adc r15,r15 + adc rax,rax + adc r12,r12 + adc rdi,rdi + adc rbp,rbp + + mov QWORD PTR[48+rbx],rdx + mov QWORD PTR[56+rbx],r15 + mov QWORD PTR[64+rbx],rax + mov QWORD PTR[72+rbx],r12 + mov QWORD PTR[80+rbx],rdi + mov QWORD PTR[88+rbx],rbp + + lea rsi,QWORD PTR[((32-128))+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + + mov rdx,QWORD PTR[((32+48))+rsp] + mov r14,QWORD PTR[((32+0))+rsp] + mov r15,QWORD PTR[((32+8))+rsp] + mov rax,QWORD PTR[((32+16))+rsp] + mov r12,QWORD PTR[((32+24))+rsp] + mov rdi,QWORD PTR[((32+32))+rsp] + mov rbp,QWORD PTR[((32+40))+rsp] + + + + mulx r9,r8,r14 + call __mulx_mont_383_nonred + mov r14,QWORD PTR[((32+96))+rsp] + lea rcx,QWORD PTR[128+rcx] + mov r8,QWORD PTR[((32+0))+rsp] + and r8,r14 + mov r9,QWORD PTR[((32+8))+rsp] + and r9,r14 + mov r10,QWORD PTR[((32+16))+rsp] + and r10,r14 + mov r11,QWORD PTR[((32+24))+rsp] + and r11,r14 + mov r13,QWORD PTR[((32+32))+rsp] + and r13,r14 + and r14,QWORD PTR[((32+40))+rsp] + + sub rdx,r8 + mov r8,QWORD PTR[rcx] + sbb r15,r9 + mov r9,QWORD PTR[8+rcx] + sbb rax,r10 + mov r10,QWORD PTR[16+rcx] + sbb r12,r11 + mov r11,QWORD PTR[24+rcx] + sbb rdi,r13 + mov r13,QWORD PTR[32+rcx] + sbb 
rbp,r14 + sbb r14,r14 + + and r8,r14 + and r9,r14 + and r10,r14 + and r11,r14 + and r13,r14 + and r14,QWORD PTR[40+rcx] + + add rdx,r8 + adc r15,r9 + adc rax,r10 + adc r12,r11 + adc rdi,r13 + adc rbp,r14 + + mov QWORD PTR[rbx],rdx + mov QWORD PTR[8+rbx],r15 + mov QWORD PTR[16+rbx],rax + mov QWORD PTR[24+rbx],r12 + mov QWORD PTR[32+rbx],rdi + mov QWORD PTR[40+rbx],rbp + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqrx_mont_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_mont_382x:: +sqrx_mont_382x ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_mulx_mont_384x + DD imagerel $L$SEH_body_mulx_mont_384x + DD imagerel $L$SEH_info_mulx_mont_384x_prologue + + DD imagerel $L$SEH_body_mulx_mont_384x + DD imagerel $L$SEH_epilogue_mulx_mont_384x + DD imagerel $L$SEH_info_mulx_mont_384x_body + + DD imagerel $L$SEH_epilogue_mulx_mont_384x + DD imagerel $L$SEH_end_mulx_mont_384x + DD imagerel $L$SEH_info_mulx_mont_384x_epilogue + + DD imagerel $L$SEH_begin_sqrx_mont_384x + DD imagerel $L$SEH_body_sqrx_mont_384x + DD imagerel $L$SEH_info_sqrx_mont_384x_prologue + + DD imagerel $L$SEH_body_sqrx_mont_384x + DD imagerel $L$SEH_epilogue_sqrx_mont_384x + DD imagerel $L$SEH_info_sqrx_mont_384x_body + + DD imagerel $L$SEH_epilogue_sqrx_mont_384x + DD imagerel $L$SEH_end_sqrx_mont_384x + DD imagerel $L$SEH_info_sqrx_mont_384x_epilogue + + DD imagerel $L$SEH_begin_mulx_382x + DD imagerel $L$SEH_body_mulx_382x + DD imagerel $L$SEH_info_mulx_382x_prologue + + DD imagerel $L$SEH_body_mulx_382x + DD imagerel $L$SEH_epilogue_mulx_382x + DD imagerel $L$SEH_info_mulx_382x_body + + DD imagerel $L$SEH_epilogue_mulx_382x + DD imagerel $L$SEH_end_mulx_382x + DD imagerel $L$SEH_info_mulx_382x_epilogue + + DD imagerel $L$SEH_begin_sqrx_382x + DD imagerel $L$SEH_body_sqrx_382x + DD imagerel $L$SEH_info_sqrx_382x_prologue + + DD imagerel $L$SEH_body_sqrx_382x + DD imagerel $L$SEH_epilogue_sqrx_382x + DD imagerel $L$SEH_info_sqrx_382x_body + + DD imagerel $L$SEH_epilogue_sqrx_382x + DD imagerel $L$SEH_end_sqrx_382x + DD imagerel $L$SEH_info_sqrx_382x_epilogue + + DD imagerel $L$SEH_begin_mulx_384 + DD imagerel $L$SEH_body_mulx_384 + DD imagerel $L$SEH_info_mulx_384_prologue + + DD imagerel $L$SEH_body_mulx_384 + DD imagerel $L$SEH_epilogue_mulx_384 + DD imagerel $L$SEH_info_mulx_384_body + + DD imagerel $L$SEH_epilogue_mulx_384 + DD imagerel $L$SEH_end_mulx_384 + DD imagerel $L$SEH_info_mulx_384_epilogue + + DD imagerel $L$SEH_begin_sqrx_384 + DD imagerel $L$SEH_body_sqrx_384 + DD imagerel $L$SEH_info_sqrx_384_prologue + + DD imagerel $L$SEH_body_sqrx_384 + DD imagerel $L$SEH_epilogue_sqrx_384 + DD imagerel $L$SEH_info_sqrx_384_body + + DD imagerel $L$SEH_epilogue_sqrx_384 + DD imagerel $L$SEH_end_sqrx_384 + DD imagerel $L$SEH_info_sqrx_384_epilogue + + DD imagerel $L$SEH_begin_redcx_mont_384 + DD imagerel $L$SEH_body_redcx_mont_384 + DD imagerel $L$SEH_info_redcx_mont_384_prologue + + DD imagerel $L$SEH_body_redcx_mont_384 + DD imagerel $L$SEH_epilogue_redcx_mont_384 + DD imagerel $L$SEH_info_redcx_mont_384_body + + DD imagerel $L$SEH_epilogue_redcx_mont_384 + DD imagerel $L$SEH_end_redcx_mont_384 + DD imagerel $L$SEH_info_redcx_mont_384_epilogue + + DD imagerel $L$SEH_begin_fromx_mont_384 + DD imagerel 
$L$SEH_body_fromx_mont_384 + DD imagerel $L$SEH_info_fromx_mont_384_prologue + + DD imagerel $L$SEH_body_fromx_mont_384 + DD imagerel $L$SEH_epilogue_fromx_mont_384 + DD imagerel $L$SEH_info_fromx_mont_384_body + + DD imagerel $L$SEH_epilogue_fromx_mont_384 + DD imagerel $L$SEH_end_fromx_mont_384 + DD imagerel $L$SEH_info_fromx_mont_384_epilogue + + DD imagerel $L$SEH_begin_sgn0x_pty_mont_384 + DD imagerel $L$SEH_body_sgn0x_pty_mont_384 + DD imagerel $L$SEH_info_sgn0x_pty_mont_384_prologue + + DD imagerel $L$SEH_body_sgn0x_pty_mont_384 + DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384 + DD imagerel $L$SEH_info_sgn0x_pty_mont_384_body + + DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384 + DD imagerel $L$SEH_end_sgn0x_pty_mont_384 + DD imagerel $L$SEH_info_sgn0x_pty_mont_384_epilogue + + DD imagerel $L$SEH_begin_sgn0x_pty_mont_384x + DD imagerel $L$SEH_body_sgn0x_pty_mont_384x + DD imagerel $L$SEH_info_sgn0x_pty_mont_384x_prologue + + DD imagerel $L$SEH_body_sgn0x_pty_mont_384x + DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384x + DD imagerel $L$SEH_info_sgn0x_pty_mont_384x_body + + DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384x + DD imagerel $L$SEH_end_sgn0x_pty_mont_384x + DD imagerel $L$SEH_info_sgn0x_pty_mont_384x_epilogue + + DD imagerel $L$SEH_begin_mulx_mont_384 + DD imagerel $L$SEH_body_mulx_mont_384 + DD imagerel $L$SEH_info_mulx_mont_384_prologue + + DD imagerel $L$SEH_body_mulx_mont_384 + DD imagerel $L$SEH_epilogue_mulx_mont_384 + DD imagerel $L$SEH_info_mulx_mont_384_body + + DD imagerel $L$SEH_epilogue_mulx_mont_384 + DD imagerel $L$SEH_end_mulx_mont_384 + DD imagerel $L$SEH_info_mulx_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqrx_mont_384 + DD imagerel $L$SEH_body_sqrx_mont_384 + DD imagerel $L$SEH_info_sqrx_mont_384_prologue + + DD imagerel $L$SEH_body_sqrx_mont_384 + DD imagerel $L$SEH_epilogue_sqrx_mont_384 + DD imagerel $L$SEH_info_sqrx_mont_384_body + + DD imagerel $L$SEH_epilogue_sqrx_mont_384 + DD imagerel $L$SEH_end_sqrx_mont_384 + DD imagerel $L$SEH_info_sqrx_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_body_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_384_prologue + + DD imagerel $L$SEH_body_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_384_body + + DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_end_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_body_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_383_prologue + + DD imagerel $L$SEH_body_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_383_body + + DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_end_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_383_epilogue + + DD imagerel $L$SEH_begin_sqrx_mont_382x + DD imagerel $L$SEH_body_sqrx_mont_382x + DD imagerel $L$SEH_info_sqrx_mont_382x_prologue + + DD imagerel $L$SEH_body_sqrx_mont_382x + DD imagerel $L$SEH_epilogue_sqrx_mont_382x + DD imagerel $L$SEH_info_sqrx_mont_382x_body + + DD imagerel $L$SEH_epilogue_sqrx_mont_382x + DD imagerel $L$SEH_end_sqrx_mont_382x + DD imagerel $L$SEH_info_sqrx_mont_382x_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_mulx_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mulx_mont_384x_body:: +DB 1,0,18,0 
+DB 000h,0f4h,029h,000h +DB 000h,0e4h,02ah,000h +DB 000h,0d4h,02bh,000h +DB 000h,0c4h,02ch,000h +DB 000h,034h,02dh,000h +DB 000h,054h,02eh,000h +DB 000h,074h,030h,000h +DB 000h,064h,031h,000h +DB 000h,001h,02fh,000h +$L$SEH_info_mulx_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_mont_384x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_sqrx_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mulx_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mulx_382x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_mulx_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_382x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sqrx_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mulx_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mulx_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,000h,000h +DB 000h,0e4h,001h,000h +DB 000h,0d4h,002h,000h +DB 000h,0c4h,003h,000h +DB 000h,034h,004h,000h +DB 000h,054h,005h,000h +DB 000h,074h,007h,000h +DB 000h,064h,008h,000h +DB 000h,052h +DB 000h,000h +$L$SEH_info_mulx_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sqrx_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_redcx_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_redcx_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_redcx_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_fromx_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_fromx_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 
000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_fromx_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0x_pty_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sgn0x_pty_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sgn0x_pty_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0x_pty_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sgn0x_pty_mont_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sgn0x_pty_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mulx_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mulx_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h +$L$SEH_info_mulx_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h +$L$SEH_info_sqrx_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_n_mul_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_n_mul_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,005h,000h +DB 000h,0e4h,006h,000h +DB 000h,0d4h,007h,000h +DB 000h,0c4h,008h,000h +DB 000h,034h,009h,000h +DB 000h,054h,00ah,000h +DB 000h,074h,00ch,000h +DB 000h,064h,00dh,000h +DB 000h,0a2h +DB 000h,000h +$L$SEH_info_sqrx_n_mul_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_n_mul_mont_383_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_n_mul_mont_383_body:: +DB 1,0,17,0 +DB 000h,0f4h,005h,000h +DB 000h,0e4h,006h,000h +DB 000h,0d4h,007h,000h +DB 000h,0c4h,008h,000h +DB 000h,034h,009h,000h +DB 000h,054h,00ah,000h +DB 000h,074h,00ch,000h +DB 000h,064h,00dh,000h +DB 000h,0a2h +DB 000h,000h +$L$SEH_info_sqrx_n_mul_mont_383_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_mont_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_mont_382x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 
000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_sqrx_mont_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/sha256-armv8.asm b/crypto/blst_src/build/win64/sha256-armv8.asm new file mode 100644 index 00000000000..0e0c54cb65b --- /dev/null +++ b/crypto/blst_src/build/win64/sha256-armv8.asm @@ -0,0 +1,1078 @@ +// +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// ==================================================================== +// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +// project. +// ==================================================================== +// +// sha256_block procedure for ARMv8. +// +// This module is stripped of scalar code paths, with rationale that all +// known processors are NEON-capable. +// +// See original module at CRYPTOGAMS for further details. + + AREA |.text|,CODE,ALIGN=8,ARM64 + + ALIGN 64 + +|$LK256| + DCDU 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + DCDU 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + DCDU 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + DCDU 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + DCDU 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + DCDU 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + DCDU 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + DCDU 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + DCDU 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + DCDU 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + DCDU 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + DCDU 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + DCDU 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + DCDU 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + DCDU 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + DCDU 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + DCDU 0 //terminator + + DCB "SHA256 block transform for ARMv8, CRYPTOGAMS by @dot-asm",0 + ALIGN 4 + ALIGN 4 + + EXPORT |blst_sha256_block_armv8|[FUNC] + ALIGN 64 +|blst_sha256_block_armv8| PROC +|$Lv8_entry| + stp x29,x30,[sp,#-16]!
+ add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adr x3,|$LK256| + +|$Loop_hw| + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + DCDU 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + DCDU 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s + DCDU 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s + DCDU 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + DCDU 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + DCDU 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s + DCDU 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s + DCDU 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + DCDU 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + DCDU 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s + DCDU 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s + DCDU 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e0660a7 //sha256su1 
v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,|$Loop_hw| + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret + ENDP + + EXPORT |blst_sha256_block_data_order|[FUNC] + ALIGN 16 +|blst_sha256_block_data_order| PROC + stp x29, x30, [sp, #-16]! + mov x29, sp + sub sp,sp,#16*4 + + adr x16,|$LK256| + add x2,x1,x2,lsl#6 // len to point at the end of inp + + ld1 {v0.16b},[x1], #16 + ld1 {v1.16b},[x1], #16 + ld1 {v2.16b},[x1], #16 + ld1 {v3.16b},[x1], #16 + ld1 {v4.4s},[x16], #16 + ld1 {v5.4s},[x16], #16 + ld1 {v6.4s},[x16], #16 + ld1 {v7.4s},[x16], #16 + rev32 v0.16b,v0.16b // yes, even on + rev32 v1.16b,v1.16b // big-endian + rev32 v2.16b,v2.16b + rev32 v3.16b,v3.16b + mov x17,sp + add v4.4s,v4.4s,v0.4s + add v5.4s,v5.4s,v1.4s + add v6.4s,v6.4s,v2.4s + st1 {v4.4s,v5.4s},[x17], #32 + add v7.4s,v7.4s,v3.4s + st1 {v6.4s,v7.4s},[x17] + sub x17,x17,#32 + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#8] + ldp w7,w8,[x0,#16] + ldp w9,w10,[x0,#24] + ldr w12,[sp,#0] + mov w13,wzr + eor w14,w4,w5 + mov w15,wzr + b |$L_00_48| + + ALIGN 16 +|$L_00_48| + ext8 v4.16b,v0.16b,v1.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext8 v7.16b,v2.16b,v3.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v3.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v0.4s,v0.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v0.4s,v0.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v0.4s,v0.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v0.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v0.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v0.4s,#15 + add w8,w8,w12 + ushr v17.4s,v0.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v0.4s,#13 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor 
v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v0.4s,v0.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v0.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext8 v4.16b,v1.16b,v2.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext8 v7.16b,v3.16b,v0.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v0.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v1.4s,v1.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v1.4s,v1.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v1.4s,v1.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v1.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v1.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v1.4s,#15 + add w4,w4,w12 + ushr v17.4s,v1.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v1.4s,#13 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v1.4s,v1.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v1.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + ext8 v4.16b,v2.16b,v3.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext8 v7.16b,v0.16b,v1.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v1.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v2.4s,v2.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr 
v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v2.4s,v2.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v2.4s,v2.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v2.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v2.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v2.4s,#15 + add w8,w8,w12 + ushr v17.4s,v2.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v2.4s,#13 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v2.4s,v2.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v2.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext8 v4.16b,v3.16b,v0.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext8 v7.16b,v1.16b,v2.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v2.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v3.4s,v3.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v3.4s,v3.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v3.4s,v3.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v3.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v3.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v3.4s,#15 + add w4,w4,w12 + ushr v17.4s,v3.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v3.4s,#13 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v3.4s,v3.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v3.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[x16] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + cmp 
w12,#0 // check for K256 terminator + ldr w12,[sp,#0] + sub x17,x17,#64 + bne |$L_00_48| + + sub x16,x16,#256 // rewind x16 + cmp x1,x2 + mov x17, #64 + cseleq x17,x17,xzr + sub x1,x1,x17 // avoid SEGV + mov x17,sp + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v0.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v0.16b,v0.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v0.4s + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v1.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v1.16b,v1.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v1.4s + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v2.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v2.16b,v2.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v2.4s + 
add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v3.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v3.16b,v3.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v3.4s + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w3,w3,w15 // h+=Sigma0(a) from the past + ldp w11,w12,[x0,#0] + add w3,w3,w13 // h+=Maj(a,b,c) from the past + ldp w13,w14,[x0,#8] + add w3,w3,w11 // accumulate + add w4,w4,w12 + ldp w11,w12,[x0,#16] + add w5,w5,w13 + add w6,w6,w14 + ldp w13,w14,[x0,#24] + add w7,w7,w11 + add w8,w8,w12 + ldr w12,[sp,#0] + stp w3,w4,[x0,#0] + add w9,w9,w13 + mov w13,wzr + stp w5,w6,[x0,#8] + add w10,w10,w14 + stp w7,w8,[x0,#16] + eor w14,w4,w5 + stp w9,w10,[x0,#24] + mov w15,wzr + mov x17,sp + bne |$L_00_48| + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret + ENDP + + + EXPORT |blst_sha256_emit|[FUNC] + ALIGN 16 +|blst_sha256_emit| PROC + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] +#ifndef __AARCH64EB__ + rev x4,x4 + rev x5,x5 + rev x6,x6 + rev x7,x7 +#endif + str w4,[x0,#4] + lsr x4,x4,#32 + str w5,[x0,#12] + lsr x5,x5,#32 + str w6,[x0,#20] + lsr x6,x6,#32 + str w7,[x0,#28] + lsr x7,x7,#32 + str w4,[x0,#0] + str w5,[x0,#8] 
+ str w6,[x0,#16] + str w7,[x0,#24] + ret + ENDP + + + + EXPORT |blst_sha256_bcopy|[FUNC] + ALIGN 16 +|blst_sha256_bcopy| PROC +|$Loop_bcopy| + ldrb w3,[x1],#1 + sub x2,x2,#1 + strb w3,[x0],#1 + cbnz x2,|$Loop_bcopy| + ret + ENDP + + + + EXPORT |blst_sha256_hcopy|[FUNC] + ALIGN 16 +|blst_sha256_hcopy| PROC + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/sha256-x86_64.asm b/crypto/blst_src/build/win64/sha256-x86_64.asm new file mode 100644 index 00000000000..d3b409235e7 --- /dev/null +++ b/crypto/blst_src/build/win64/sha256-x86_64.asm @@ -0,0 +1,1570 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +ALIGN 64 + +K256:: + DD 0428a2f98h,071374491h,0b5c0fbcfh,0e9b5dba5h + DD 03956c25bh,059f111f1h,0923f82a4h,0ab1c5ed5h + DD 0d807aa98h,012835b01h,0243185beh,0550c7dc3h + DD 072be5d74h,080deb1feh,09bdc06a7h,0c19bf174h + DD 0e49b69c1h,0efbe4786h,00fc19dc6h,0240ca1cch + DD 02de92c6fh,04a7484aah,05cb0a9dch,076f988dah + DD 0983e5152h,0a831c66dh,0b00327c8h,0bf597fc7h + DD 0c6e00bf3h,0d5a79147h,006ca6351h,014292967h + DD 027b70a85h,02e1b2138h,04d2c6dfch,053380d13h + DD 0650a7354h,0766a0abbh,081c2c92eh,092722c85h + DD 0a2bfe8a1h,0a81a664bh,0c24b8b70h,0c76c51a3h + DD 0d192e819h,0d6990624h,0f40e3585h,0106aa070h + DD 019a4c116h,01e376c08h,02748774ch,034b0bcb5h + DD 0391c0cb3h,04ed8aa4ah,05b9cca4fh,0682e6ff3h + DD 0748f82eeh,078a5636fh,084c87814h,08cc70208h + DD 090befffah,0a4506cebh,0bef9a3f7h,0c67178f2h + + DD 000010203h,004050607h,008090a0bh,00c0d0e0fh + DD 003020100h,00b0a0908h,0ffffffffh,0ffffffffh + DD 0ffffffffh,0ffffffffh,003020100h,00b0a0908h +DB 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 +DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54 +DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 +DB 32,64,100,111,116,45,97,115,109,0 +PUBLIC blst_sha256_block_data_order_shaext + + +ALIGN 64 +blst_sha256_block_data_order_shaext PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_blst_sha256_block_data_order_shaext:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + sub rsp,058h + + movaps XMMWORD PTR[(-88)+r11],xmm6 + + movaps XMMWORD PTR[(-72)+r11],xmm7 + + movaps XMMWORD PTR[(-56)+r11],xmm8 + + movaps XMMWORD PTR[(-40)+r11],xmm9 + + movaps XMMWORD PTR[(-24)+r11],xmm10 + +$L$SEH_body_blst_sha256_block_data_order_shaext:: + + lea rcx,QWORD PTR[((K256+128))] + movdqu xmm1,XMMWORD PTR[rdi] + movdqu xmm2,XMMWORD PTR[16+rdi] + movdqa xmm7,XMMWORD PTR[((256-128))+rcx] + + pshufd xmm0,xmm1,01bh + pshufd xmm1,xmm1,0b1h + pshufd xmm2,xmm2,01bh + movdqa xmm8,xmm7 +DB 102,15,58,15,202,8 + punpcklqdq xmm2,xmm0 + jmp $L$oop_shaext + +ALIGN 16 +$L$oop_shaext:: + movdqu xmm3,XMMWORD PTR[rsi] + movdqu xmm4,XMMWORD PTR[16+rsi] + movdqu xmm5,XMMWORD PTR[32+rsi] +DB 102,15,56,0,223 + movdqu xmm6,XMMWORD PTR[48+rsi] + + movdqa xmm0,XMMWORD PTR[((0-128))+rcx] + paddd xmm0,xmm3 +DB 102,15,56,0,231 + movdqa xmm10,xmm2 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + nop + movdqa xmm9,xmm1 +DB 15,56,203,202 + + movdqa xmm0,XMMWORD PTR[((16-128))+rcx] + paddd xmm0,xmm4 +DB 102,15,56,0,239 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + lea rsi,QWORD PTR[64+rsi] +DB 15,56,204,220 +DB 15,56,203,202 + + movdqa xmm0,XMMWORD PTR[((32-128))+rcx] + paddd xmm0,xmm5 +DB 102,15,56,0,247 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm6 +DB 102,15,58,15,253,4 + nop + paddd xmm3,xmm7 +DB 15,56,204,229 +DB 15,56,203,202 + + movdqa xmm0,XMMWORD 
PTR[((48-128))+rcx] + paddd xmm0,xmm6 +DB 15,56,205,222 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm3 +DB 102,15,58,15,254,4 + nop + paddd xmm4,xmm7 +DB 15,56,204,238 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((64-128))+rcx] + paddd xmm0,xmm3 +DB 15,56,205,227 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm4 +DB 102,15,58,15,251,4 + nop + paddd xmm5,xmm7 +DB 15,56,204,243 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((80-128))+rcx] + paddd xmm0,xmm4 +DB 15,56,205,236 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm5 +DB 102,15,58,15,252,4 + nop + paddd xmm6,xmm7 +DB 15,56,204,220 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((96-128))+rcx] + paddd xmm0,xmm5 +DB 15,56,205,245 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm6 +DB 102,15,58,15,253,4 + nop + paddd xmm3,xmm7 +DB 15,56,204,229 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((112-128))+rcx] + paddd xmm0,xmm6 +DB 15,56,205,222 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm3 +DB 102,15,58,15,254,4 + nop + paddd xmm4,xmm7 +DB 15,56,204,238 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((128-128))+rcx] + paddd xmm0,xmm3 +DB 15,56,205,227 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm4 +DB 102,15,58,15,251,4 + nop + paddd xmm5,xmm7 +DB 15,56,204,243 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((144-128))+rcx] + paddd xmm0,xmm4 +DB 15,56,205,236 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm5 +DB 102,15,58,15,252,4 + nop + paddd xmm6,xmm7 +DB 15,56,204,220 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((160-128))+rcx] + paddd xmm0,xmm5 +DB 15,56,205,245 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm6 +DB 102,15,58,15,253,4 + nop + paddd xmm3,xmm7 +DB 15,56,204,229 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((176-128))+rcx] + paddd xmm0,xmm6 +DB 15,56,205,222 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm3 +DB 102,15,58,15,254,4 + nop + paddd xmm4,xmm7 +DB 15,56,204,238 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((192-128))+rcx] + paddd xmm0,xmm3 +DB 15,56,205,227 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm4 +DB 102,15,58,15,251,4 + nop + paddd xmm5,xmm7 +DB 15,56,204,243 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((208-128))+rcx] + paddd xmm0,xmm4 +DB 15,56,205,236 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm5 +DB 102,15,58,15,252,4 +DB 15,56,203,202 + paddd xmm6,xmm7 + + movdqa xmm0,XMMWORD PTR[((224-128))+rcx] + paddd xmm0,xmm5 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh +DB 15,56,205,245 + movdqa xmm7,xmm8 +DB 15,56,203,202 + + movdqa xmm0,XMMWORD PTR[((240-128))+rcx] + paddd xmm0,xmm6 + nop +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + dec rdx + nop +DB 15,56,203,202 + + paddd xmm2,xmm10 + paddd xmm1,xmm9 + jnz $L$oop_shaext + + pshufd xmm2,xmm2,0b1h + pshufd xmm7,xmm1,01bh + pshufd xmm1,xmm1,0b1h + punpckhqdq xmm1,xmm2 +DB 102,15,58,15,215,8 + + movdqu XMMWORD PTR[rdi],xmm1 + movdqu XMMWORD PTR[16+rdi],xmm2 + movaps xmm6,XMMWORD PTR[((-88))+r11] + movaps xmm7,XMMWORD PTR[((-72))+r11] + movaps xmm8,XMMWORD PTR[((-56))+r11] + movaps xmm9,XMMWORD PTR[((-40))+r11] + movaps xmm10,XMMWORD PTR[((-24))+r11] + mov rsp,r11 + +$L$SEH_epilogue_blst_sha256_block_data_order_shaext:: + mov rdi,QWORD PTR[8+r11] ;WIN64 epilogue + mov rsi,QWORD PTR[16+r11] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_blst_sha256_block_data_order_shaext:: +blst_sha256_block_data_order_shaext ENDP +PUBLIC blst_sha256_block_data_order + + +ALIGN 64 +blst_sha256_block_data_order PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi 
;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_blst_sha256_block_data_order:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 + sub rsp,104 + + lea rdx,QWORD PTR[rdx*4+rsi] + mov QWORD PTR[rsp],rdi + + mov QWORD PTR[16+rsp],rdx + movaps XMMWORD PTR[32+rsp],xmm6 + + movaps XMMWORD PTR[48+rsp],xmm7 + + movaps XMMWORD PTR[64+rsp],xmm8 + + movaps XMMWORD PTR[80+rsp],xmm9 + + mov rbp,rsp + +$L$SEH_body_blst_sha256_block_data_order:: + + + lea rsp,QWORD PTR[((-64))+rsp] + mov eax,DWORD PTR[rdi] + and rsp,-64 + mov ebx,DWORD PTR[4+rdi] + mov ecx,DWORD PTR[8+rdi] + mov edx,DWORD PTR[12+rdi] + mov r8d,DWORD PTR[16+rdi] + mov r9d,DWORD PTR[20+rdi] + mov r10d,DWORD PTR[24+rdi] + mov r11d,DWORD PTR[28+rdi] + + + jmp $L$loop_ssse3 +ALIGN 16 +$L$loop_ssse3:: + movdqa xmm7,XMMWORD PTR[((K256+256))] + mov QWORD PTR[8+rbp],rsi + movdqu xmm0,XMMWORD PTR[rsi] + movdqu xmm1,XMMWORD PTR[16+rsi] + movdqu xmm2,XMMWORD PTR[32+rsi] +DB 102,15,56,0,199 + movdqu xmm3,XMMWORD PTR[48+rsi] + lea rsi,QWORD PTR[K256] +DB 102,15,56,0,207 + movdqa xmm4,XMMWORD PTR[rsi] + movdqa xmm5,XMMWORD PTR[16+rsi] +DB 102,15,56,0,215 + paddd xmm4,xmm0 + movdqa xmm6,XMMWORD PTR[32+rsi] +DB 102,15,56,0,223 + movdqa xmm7,XMMWORD PTR[48+rsi] + paddd xmm5,xmm1 + paddd xmm6,xmm2 + paddd xmm7,xmm3 + movdqa XMMWORD PTR[rsp],xmm4 + mov r14d,eax + movdqa XMMWORD PTR[16+rsp],xmm5 + mov edi,ebx + movdqa XMMWORD PTR[32+rsp],xmm6 + xor edi,ecx + movdqa XMMWORD PTR[48+rsp],xmm7 + mov r13d,r8d + jmp $L$ssse3_00_47 + +ALIGN 16 +$L$ssse3_00_47:: + sub rsi,-64 + ror r13d,14 + movdqa xmm4,xmm1 + mov eax,r14d + mov r12d,r9d + movdqa xmm7,xmm3 + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax +DB 102,15,58,15,224,4 + and r12d,r8d + xor r13d,r8d +DB 102,15,58,15,250,4 + add r11d,DWORD PTR[rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,ebx + add r11d,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,eax + add r11d,r13d + xor edi,ebx + paddd xmm0,xmm7 + ror r14d,2 + add edx,r11d + psrld xmm6,7 + add r11d,edi + mov r13d,edx + pshufd xmm7,xmm3,250 + add r14d,r11d + ror r13d,14 + pslld xmm5,14 + mov r11d,r14d + mov r12d,r8d + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + psrld xmm6,11 + xor r14d,r11d + pxor xmm4,xmm5 + and r12d,edx + xor r13d,edx + pslld xmm5,11 + add r10d,DWORD PTR[4+rsp] + mov edi,r11d + pxor xmm4,xmm6 + xor r12d,r9d + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,eax + add r10d,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,r11d + psrld xmm7,10 + add r10d,r13d + xor r15d,eax + paddd xmm0,xmm4 + ror r14d,2 + add ecx,r10d + psrlq xmm6,17 + add r10d,r15d + mov r13d,ecx + add r14d,r10d + pxor xmm7,xmm6 + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + psrlq xmm6,2 + xor r13d,ecx + xor r12d,r8d + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,r10d + and r12d,ecx + pshufd xmm7,xmm7,128 + xor r13d,ecx + add r9d,DWORD PTR[8+rsp] + mov r15d,r10d + psrldq xmm7,8 + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + paddd xmm0,xmm7 + and edi,r15d + xor r14d,r10d + add r9d,r13d + pshufd xmm7,xmm0,80 + xor edi,r11d + ror r14d,2 + add ebx,r9d + movdqa xmm6,xmm7 + add r9d,edi + mov r13d,ebx + psrld xmm7,10 + add r14d,r9d + ror r13d,14 + psrlq xmm6,17 + mov r9d,r14d + mov r12d,ecx + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + psrlq xmm6,2 + and r12d,ebx + xor 
r13d,ebx + add r8d,DWORD PTR[12+rsp] + pxor xmm7,xmm6 + mov edi,r9d + xor r12d,edx + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,r10d + add r8d,r12d + movdqa xmm6,XMMWORD PTR[rsi] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + paddd xmm0,xmm7 + ror r14d,2 + add eax,r8d + add r8d,r15d + paddd xmm6,xmm0 + mov r13d,eax + add r14d,r8d + movdqa XMMWORD PTR[rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm2 + mov r8d,r14d + mov r12d,ebx + movdqa xmm7,xmm0 + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d +DB 102,15,58,15,225,4 + and r12d,eax + xor r13d,eax +DB 102,15,58,15,251,4 + add edx,DWORD PTR[16+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,r9d + add edx,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,r8d + add edx,r13d + xor edi,r9d + paddd xmm1,xmm7 + ror r14d,2 + add r11d,edx + psrld xmm6,7 + add edx,edi + mov r13d,r11d + pshufd xmm7,xmm0,250 + add r14d,edx + ror r13d,14 + pslld xmm5,14 + mov edx,r14d + mov r12d,eax + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + psrld xmm6,11 + xor r14d,edx + pxor xmm4,xmm5 + and r12d,r11d + xor r13d,r11d + pslld xmm5,11 + add ecx,DWORD PTR[20+rsp] + mov edi,edx + pxor xmm4,xmm6 + xor r12d,ebx + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,r8d + add ecx,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,edx + psrld xmm7,10 + add ecx,r13d + xor r15d,r8d + paddd xmm1,xmm4 + ror r14d,2 + add r10d,ecx + psrlq xmm6,17 + add ecx,r15d + mov r13d,r10d + add r14d,ecx + pxor xmm7,xmm6 + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + psrlq xmm6,2 + xor r13d,r10d + xor r12d,eax + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,ecx + and r12d,r10d + pshufd xmm7,xmm7,128 + xor r13d,r10d + add ebx,DWORD PTR[24+rsp] + mov r15d,ecx + psrldq xmm7,8 + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + paddd xmm1,xmm7 + and edi,r15d + xor r14d,ecx + add ebx,r13d + pshufd xmm7,xmm1,80 + xor edi,edx + ror r14d,2 + add r9d,ebx + movdqa xmm6,xmm7 + add ebx,edi + mov r13d,r9d + psrld xmm7,10 + add r14d,ebx + ror r13d,14 + psrlq xmm6,17 + mov ebx,r14d + mov r12d,r10d + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + psrlq xmm6,2 + and r12d,r9d + xor r13d,r9d + add eax,DWORD PTR[28+rsp] + pxor xmm7,xmm6 + mov edi,ebx + xor r12d,r11d + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,ecx + add eax,r12d + movdqa xmm6,XMMWORD PTR[16+rsi] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,ebx + add eax,r13d + xor r15d,ecx + paddd xmm1,xmm7 + ror r14d,2 + add r8d,eax + add eax,r15d + paddd xmm6,xmm1 + mov r13d,r8d + add r14d,eax + movdqa XMMWORD PTR[16+rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm3 + mov eax,r14d + mov r12d,r9d + movdqa xmm7,xmm1 + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax +DB 102,15,58,15,226,4 + and r12d,r8d + xor r13d,r8d +DB 102,15,58,15,248,4 + add r11d,DWORD PTR[32+rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,ebx + add r11d,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,eax + add r11d,r13d + xor edi,ebx + paddd xmm2,xmm7 + ror r14d,2 + add edx,r11d + psrld xmm6,7 + add r11d,edi + mov r13d,edx + pshufd xmm7,xmm1,250 + add r14d,r11d + ror r13d,14 + pslld xmm5,14 + mov r11d,r14d + mov r12d,r8d + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + psrld xmm6,11 + xor r14d,r11d + pxor xmm4,xmm5 + and r12d,edx + xor r13d,edx + pslld xmm5,11 + add 
r10d,DWORD PTR[36+rsp] + mov edi,r11d + pxor xmm4,xmm6 + xor r12d,r9d + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,eax + add r10d,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,r11d + psrld xmm7,10 + add r10d,r13d + xor r15d,eax + paddd xmm2,xmm4 + ror r14d,2 + add ecx,r10d + psrlq xmm6,17 + add r10d,r15d + mov r13d,ecx + add r14d,r10d + pxor xmm7,xmm6 + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + psrlq xmm6,2 + xor r13d,ecx + xor r12d,r8d + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,r10d + and r12d,ecx + pshufd xmm7,xmm7,128 + xor r13d,ecx + add r9d,DWORD PTR[40+rsp] + mov r15d,r10d + psrldq xmm7,8 + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + paddd xmm2,xmm7 + and edi,r15d + xor r14d,r10d + add r9d,r13d + pshufd xmm7,xmm2,80 + xor edi,r11d + ror r14d,2 + add ebx,r9d + movdqa xmm6,xmm7 + add r9d,edi + mov r13d,ebx + psrld xmm7,10 + add r14d,r9d + ror r13d,14 + psrlq xmm6,17 + mov r9d,r14d + mov r12d,ecx + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + psrlq xmm6,2 + and r12d,ebx + xor r13d,ebx + add r8d,DWORD PTR[44+rsp] + pxor xmm7,xmm6 + mov edi,r9d + xor r12d,edx + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,r10d + add r8d,r12d + movdqa xmm6,XMMWORD PTR[32+rsi] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + paddd xmm2,xmm7 + ror r14d,2 + add eax,r8d + add r8d,r15d + paddd xmm6,xmm2 + mov r13d,eax + add r14d,r8d + movdqa XMMWORD PTR[32+rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm0 + mov r8d,r14d + mov r12d,ebx + movdqa xmm7,xmm2 + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d +DB 102,15,58,15,227,4 + and r12d,eax + xor r13d,eax +DB 102,15,58,15,249,4 + add edx,DWORD PTR[48+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,r9d + add edx,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,r8d + add edx,r13d + xor edi,r9d + paddd xmm3,xmm7 + ror r14d,2 + add r11d,edx + psrld xmm6,7 + add edx,edi + mov r13d,r11d + pshufd xmm7,xmm2,250 + add r14d,edx + ror r13d,14 + pslld xmm5,14 + mov edx,r14d + mov r12d,eax + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + psrld xmm6,11 + xor r14d,edx + pxor xmm4,xmm5 + and r12d,r11d + xor r13d,r11d + pslld xmm5,11 + add ecx,DWORD PTR[52+rsp] + mov edi,edx + pxor xmm4,xmm6 + xor r12d,ebx + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,r8d + add ecx,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,edx + psrld xmm7,10 + add ecx,r13d + xor r15d,r8d + paddd xmm3,xmm4 + ror r14d,2 + add r10d,ecx + psrlq xmm6,17 + add ecx,r15d + mov r13d,r10d + add r14d,ecx + pxor xmm7,xmm6 + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + psrlq xmm6,2 + xor r13d,r10d + xor r12d,eax + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,ecx + and r12d,r10d + pshufd xmm7,xmm7,128 + xor r13d,r10d + add ebx,DWORD PTR[56+rsp] + mov r15d,ecx + psrldq xmm7,8 + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + paddd xmm3,xmm7 + and edi,r15d + xor r14d,ecx + add ebx,r13d + pshufd xmm7,xmm3,80 + xor edi,edx + ror r14d,2 + add r9d,ebx + movdqa xmm6,xmm7 + add ebx,edi + mov r13d,r9d + psrld xmm7,10 + add r14d,ebx + ror r13d,14 + psrlq xmm6,17 + mov ebx,r14d + mov r12d,r10d + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + psrlq xmm6,2 + and r12d,r9d + xor r13d,r9d + add eax,DWORD PTR[60+rsp] + pxor xmm7,xmm6 + mov edi,ebx + xor r12d,r11d + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,ecx + add 
eax,r12d + movdqa xmm6,XMMWORD PTR[48+rsi] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,ebx + add eax,r13d + xor r15d,ecx + paddd xmm3,xmm7 + ror r14d,2 + add r8d,eax + add eax,r15d + paddd xmm6,xmm3 + mov r13d,r8d + add r14d,eax + movdqa XMMWORD PTR[48+rsp],xmm6 + cmp BYTE PTR[67+rsi],0 + jne $L$ssse3_00_47 + ror r13d,14 + mov eax,r14d + mov r12d,r9d + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD PTR[rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + xor r15d,ebx + add r11d,r12d + ror r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + ror r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + ror r13d,14 + mov r11d,r14d + mov r12d,r8d + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD PTR[4+rsp] + mov edi,r11d + xor r12d,r9d + ror r14d,11 + xor edi,eax + add r10d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + ror r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + xor r13d,ecx + xor r12d,r8d + ror r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD PTR[8+rsp] + mov r15d,r10d + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + ror r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + ror r13d,14 + mov r9d,r14d + mov r12d,ecx + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD PTR[12+rsp] + mov edi,r9d + xor r12d,edx + ror r14d,11 + xor edi,r10d + add r8d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + ror r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + ror r13d,14 + mov r8d,r14d + mov r12d,ebx + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD PTR[16+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + xor r15d,r9d + add edx,r12d + ror r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + ror r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + ror r13d,14 + mov edx,r14d + mov r12d,eax + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD PTR[20+rsp] + mov edi,edx + xor r12d,ebx + ror r14d,11 + xor edi,r8d + add ecx,r12d + ror r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + ror r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + xor r13d,r10d + xor r12d,eax + ror r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD PTR[24+rsp] + mov r15d,ecx + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + ror r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + ror r13d,14 + mov ebx,r14d + mov r12d,r10d + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD PTR[28+rsp] + mov edi,ebx + xor r12d,r11d + ror r14d,11 + xor edi,ecx + add eax,r12d + ror r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + ror r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + ror r13d,14 + mov eax,r14d + mov r12d,r9d + ror r14d,9 + xor r13d,r8d + xor 
r12d,r10d + ror r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD PTR[32+rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + xor r15d,ebx + add r11d,r12d + ror r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + ror r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + ror r13d,14 + mov r11d,r14d + mov r12d,r8d + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD PTR[36+rsp] + mov edi,r11d + xor r12d,r9d + ror r14d,11 + xor edi,eax + add r10d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + ror r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + xor r13d,ecx + xor r12d,r8d + ror r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD PTR[40+rsp] + mov r15d,r10d + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + ror r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + ror r13d,14 + mov r9d,r14d + mov r12d,ecx + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD PTR[44+rsp] + mov edi,r9d + xor r12d,edx + ror r14d,11 + xor edi,r10d + add r8d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + ror r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + ror r13d,14 + mov r8d,r14d + mov r12d,ebx + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD PTR[48+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + xor r15d,r9d + add edx,r12d + ror r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + ror r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + ror r13d,14 + mov edx,r14d + mov r12d,eax + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD PTR[52+rsp] + mov edi,edx + xor r12d,ebx + ror r14d,11 + xor edi,r8d + add ecx,r12d + ror r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + ror r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + xor r13d,r10d + xor r12d,eax + ror r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD PTR[56+rsp] + mov r15d,ecx + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + ror r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + ror r13d,14 + mov ebx,r14d + mov r12d,r10d + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD PTR[60+rsp] + mov edi,ebx + xor r12d,r11d + ror r14d,11 + xor edi,ecx + add eax,r12d + ror r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + ror r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + mov rdi,QWORD PTR[rbp] + mov eax,r14d + mov rsi,QWORD PTR[8+rbp] + + add eax,DWORD PTR[rdi] + add ebx,DWORD PTR[4+rdi] + add ecx,DWORD PTR[8+rdi] + add edx,DWORD PTR[12+rdi] + add r8d,DWORD PTR[16+rdi] + add r9d,DWORD PTR[20+rdi] + add r10d,DWORD PTR[24+rdi] + add r11d,DWORD PTR[28+rdi] + + lea rsi,QWORD PTR[64+rsi] + cmp rsi,QWORD PTR[16+rbp] + + mov DWORD PTR[rdi],eax + mov DWORD PTR[4+rdi],ebx + mov DWORD PTR[8+rdi],ecx + mov DWORD PTR[12+rdi],edx + mov DWORD 
PTR[16+rdi],r8d + mov DWORD PTR[20+rdi],r9d + mov DWORD PTR[24+rdi],r10d + mov DWORD PTR[28+rdi],r11d + jb $L$loop_ssse3 + + xorps xmm0,xmm0 + lea r11,QWORD PTR[((104+48))+rbp] + + movaps XMMWORD PTR[rsp],xmm0 + movaps XMMWORD PTR[16+rsp],xmm0 + movaps XMMWORD PTR[32+rsp],xmm0 + movaps XMMWORD PTR[48+rsp],xmm0 + movaps xmm6,XMMWORD PTR[32+rbp] + movaps xmm7,XMMWORD PTR[48+rbp] + movaps xmm8,XMMWORD PTR[64+rbp] + movaps xmm9,XMMWORD PTR[80+rbp] + mov r15,QWORD PTR[104+rbp] + + mov r14,QWORD PTR[((-40))+r11] + + mov r13,QWORD PTR[((-32))+r11] + + mov r12,QWORD PTR[((-24))+r11] + + mov rbx,QWORD PTR[((-16))+r11] + + mov rbp,QWORD PTR[((-8))+r11] + +$L$SEH_epilogue_blst_sha256_block_data_order:: + mov rdi,QWORD PTR[8+r11] ;WIN64 epilogue + mov rsi,QWORD PTR[16+r11] + + lea rsp,QWORD PTR[r11] + DB 0F3h,0C3h ;repret + +$L$SEH_end_blst_sha256_block_data_order:: +blst_sha256_block_data_order ENDP +PUBLIC blst_sha256_emit + + +ALIGN 16 +blst_sha256_emit PROC PUBLIC + DB 243,15,30,250 + mov r8,QWORD PTR[rdx] + mov r9,QWORD PTR[8+rdx] + mov r10,QWORD PTR[16+rdx] + bswap r8 + mov r11,QWORD PTR[24+rdx] + bswap r9 + mov DWORD PTR[4+rcx],r8d + bswap r10 + mov DWORD PTR[12+rcx],r9d + bswap r11 + mov DWORD PTR[20+rcx],r10d + shr r8,32 + mov DWORD PTR[28+rcx],r11d + shr r9,32 + mov DWORD PTR[rcx],r8d + shr r10,32 + mov DWORD PTR[8+rcx],r9d + shr r11,32 + mov DWORD PTR[16+rcx],r10d + mov DWORD PTR[24+rcx],r11d + DB 0F3h,0C3h ;repret +blst_sha256_emit ENDP + +PUBLIC blst_sha256_bcopy + + +ALIGN 16 +blst_sha256_bcopy PROC PUBLIC + DB 243,15,30,250 + sub rcx,rdx +$L$oop_bcopy:: + movzx eax,BYTE PTR[rdx] + lea rdx,QWORD PTR[1+rdx] + mov BYTE PTR[((-1))+rdx*1+rcx],al + dec r8 + jnz $L$oop_bcopy + DB 0F3h,0C3h ;repret +blst_sha256_bcopy ENDP + +PUBLIC blst_sha256_hcopy + + +ALIGN 16 +blst_sha256_hcopy PROC PUBLIC + DB 243,15,30,250 + mov r8,QWORD PTR[rdx] + mov r9,QWORD PTR[8+rdx] + mov r10,QWORD PTR[16+rdx] + mov r11,QWORD PTR[24+rdx] + mov QWORD PTR[rcx],r8 + mov QWORD PTR[8+rcx],r9 + mov QWORD PTR[16+rcx],r10 + mov QWORD PTR[24+rcx],r11 + DB 0F3h,0C3h ;repret +blst_sha256_hcopy ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_blst_sha256_block_data_order_shaext + DD imagerel $L$SEH_body_blst_sha256_block_data_order_shaext + DD imagerel $L$SEH_info_blst_sha256_block_data_order_shaext_prologue + + DD imagerel $L$SEH_body_blst_sha256_block_data_order_shaext + DD imagerel $L$SEH_epilogue_blst_sha256_block_data_order_shaext + DD imagerel $L$SEH_info_blst_sha256_block_data_order_shaext_body + + DD imagerel $L$SEH_epilogue_blst_sha256_block_data_order_shaext + DD imagerel $L$SEH_end_blst_sha256_block_data_order_shaext + DD imagerel $L$SEH_info_blst_sha256_block_data_order_shaext_epilogue + + DD imagerel $L$SEH_begin_blst_sha256_block_data_order + DD imagerel $L$SEH_body_blst_sha256_block_data_order + DD imagerel $L$SEH_info_blst_sha256_block_data_order_prologue + + DD imagerel $L$SEH_body_blst_sha256_block_data_order + DD imagerel $L$SEH_epilogue_blst_sha256_block_data_order + DD imagerel $L$SEH_info_blst_sha256_block_data_order_body + + DD imagerel $L$SEH_epilogue_blst_sha256_block_data_order + DD imagerel $L$SEH_end_blst_sha256_block_data_order + DD imagerel $L$SEH_info_blst_sha256_block_data_order_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_blst_sha256_block_data_order_shaext_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_blst_sha256_block_data_order_shaext_body:: +DB 1,0,15,0 +DB 000h,068h,000h,000h 
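As a reading aid for the MASM above: blst_sha256_emit appears to write the eight 32-bit SHA-256 state words out in big-endian byte order (that is what the bswap/shr pairs accomplish), blst_sha256_bcopy is a byte-wise copy and blst_sha256_hcopy copies the eight state words verbatim. A portable sketch of the emit step, with a hypothetical name and under the assumption that this reading of the assembly is right:

#include <stddef.h>
#include <stdint.h>

/* Write the eight 32-bit SHA-256 state words as 32 big-endian bytes;
 * portable counterpart of what the bswap-based assembly above does. */
static void sha256_emit_be(unsigned char md[32], const uint32_t h[8])
{
    size_t i;
    for (i = 0; i < 8; i++) {
        md[4*i + 0] = (unsigned char)(h[i] >> 24);
        md[4*i + 1] = (unsigned char)(h[i] >> 16);
        md[4*i + 2] = (unsigned char)(h[i] >>  8);
        md[4*i + 3] = (unsigned char)(h[i]);
    }
}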
+DB 000h,078h,001h,000h +DB 000h,088h,002h,000h +DB 000h,098h,003h,000h +DB 000h,0a8h,004h,000h +DB 000h,074h,00ch,000h +DB 000h,064h,00dh,000h +DB 000h,0a2h +DB 000h,000h,000h,000h,000h,000h +$L$SEH_info_blst_sha256_block_data_order_shaext_epilogue:: +DB 1,0,5,11 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,003h +DB 000h,000h + +$L$SEH_info_blst_sha256_block_data_order_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_blst_sha256_block_data_order_body:: +DB 1,0,26,5 +DB 000h,068h,002h,000h +DB 000h,078h,003h,000h +DB 000h,088h,004h,000h +DB 000h,098h,005h,000h +DB 000h,0f4h,00dh,000h +DB 000h,0e4h,00eh,000h +DB 000h,0d4h,00fh,000h +DB 000h,0c4h,010h,000h +DB 000h,034h,011h,000h +DB 000h,074h,014h,000h +DB 000h,064h,015h,000h +DB 000h,003h +DB 000h,001h,012h,000h +DB 000h,050h +$L$SEH_info_blst_sha256_block_data_order_epilogue:: +DB 1,0,5,11 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,003h +DB 000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/bulk_addition.c b/crypto/blst_src/bulk_addition.c new file mode 100644 index 00000000000..81afc530665 --- /dev/null +++ b/crypto/blst_src/bulk_addition.c @@ -0,0 +1,168 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" +#include "point.h" + +/* + * This implementation uses explicit addition formula: + * + * λ = (Y₂-Y₁)/(X₂-X₁) + * X₃ = λ²-(X₁+X₂) + * Y₃ = λ⋅(X₁-X₃)-Y₁ + * + * But since we don't know if we'll have to add point to itself, we need + * to eventually resort to corresponding doubling formula: + * + * λ = 3X₁²/2Y₁ + * X₃ = λ²-2X₁ + * Y₃ = λ⋅(X₁-X₃)-Y₁ + * + * The formulae use prohibitively expensive inversion, but whenever we + * have a lot of affine points to accumulate, we can amortize the cost + * by applying Montgomery's batch inversion approach. As a result, + * asymptotic[!] per-point cost for addition is as small as 5M+1S. For + * comparison, ptype##_dadd_affine takes 8M+5S. In practice, all things + * considered, the improvement coefficient varies from 60% to 85% + * depending on platform and curve. + * + * THIS IMPLEMENTATION IS *NOT* CONSTANT-TIME. [But if there is an + * application that requires constant time-ness, speak up!] + */ + +/* + * Calculate λ's numerator and denominator. 
+ * + * input: A x1 y1 - + * B x2 y2 - + * output: + * if A!=B: A x1 y1 (x2-x1)*mul_acc + * B x2+x1 y2-y1 (x2-x1) + * + * if A==B: A x y 2y*mul_acc + * B 2x 3*x^2 2y + * + * if A==-B: A 0 0 1*mul_acc + * B 0 3*x^2 0 + */ +#define HEAD(ptype, bits, field, one) \ +static void ptype##_head(ptype AB[2], const vec##bits mul_acc) \ +{ \ + ptype *A = AB, *B = AB+1; \ + limb_t inf = vec_is_zero(A, sizeof(ptype##_affine)) | \ + vec_is_zero(B, sizeof(ptype##_affine)); \ + static const vec##bits zero = { 0 }; \ +\ + sub_##field(B->Z, B->X, A->X); /* X2-X1 */ \ + add_##field(B->X, B->X, A->X); /* X2+X1 */ \ + add_##field(A->Z, B->Y, A->Y); /* Y2+Y1 */ \ + sub_##field(B->Y, B->Y, A->Y); /* Y2-Y1 */ \ + if (vec_is_zero(B->Z, sizeof(B->Z))) { /* X2==X1 */ \ + inf = vec_is_zero(A->Z, sizeof(A->Z)); \ + vec_select(B->X, A->Z, B->X, sizeof(B->X), inf); \ + sqr_##field(B->Y, A->X); \ + mul_by_3_##field(B->Y, B->Y); /* 3*X1^2 */ \ + vec_copy(B->Z, A->Z, sizeof(B->Z)); /* 2*Y1 */ \ + } /* B->Y is numenator */ \ + /* B->Z is denominator */ \ + vec_select(A->X, B->X, A->X, sizeof(A->X), inf); \ + vec_select(A->Y, A->Z, A->Y, sizeof(A->Y), inf); \ + vec_select(A->Z, one, B->Z, sizeof(A->Z), inf); \ + vec_select(B->Z, zero, B->Z, sizeof(B->Z), inf); \ + if (mul_acc != NULL) \ + mul_##field(A->Z, A->Z, mul_acc); /* chain multiplication */\ +} + +/* + * Calculate λ and resulting coordinates. + * + * input: A x1 y1 - + * B x2+x1 nominator - + * lambda 1/denominator + * output: D x3=(nom/den)^2-(x2+x1) y3=(nom/den)(x1-x3)-y1 + */ +#define TAIL(ptype, bits, field, one) \ +static void ptype##_tail(ptype *D, ptype AB[2], vec##bits lambda) \ +{ \ + ptype *A = AB, *B = AB+1; \ + vec##bits llambda; \ + limb_t inf = vec_is_zero(B->Z, sizeof(B->Z)); \ +\ + mul_##field(lambda, lambda, B->Y); /* λ = (Y2-Y1)/(X2-X1) */ \ + /* alt. 3*X1^2/2*Y1 */ \ + sqr_##field(llambda, lambda); \ + sub_##field(D->X, llambda, B->X); /* X3 = λ^2-X1-X2 */ \ +\ + sub_##field(D->Y, A->X, D->X); \ + mul_##field(D->Y, D->Y, lambda); \ + sub_##field(D->Y, D->Y, A->Y); /* Y3 = λ*(X1-X3)-Y1 */ \ +\ + vec_select(D->X, A->X, D->X, 2*sizeof(D->X), inf); \ + vec_select(B->Z, one, B->Z, sizeof(B->Z), inf); \ +} + +/* + * |points[]| is volatile buffer with |X|s and |Y|s initially holding + * input affine coordinates, and with |Z|s being used as additional + * temporary storage [unrelated to Jacobian coordinates]. |sum| is + * in-/output, initialize to infinity accordingly. 
+ */ +#define ADDITION_BTREE(prefix, ptype, bits, field, one) \ +HEAD(ptype, bits, field, one) \ +TAIL(ptype, bits, field, one) \ +static void ptype##s_accumulate(ptype *sum, ptype points[], size_t n) \ +{ \ + ptype *dst; \ + void *mul_acc; \ + size_t i; \ +\ + while (n >= 16) { \ + if (n & 1) \ + ptype##_dadd_affine(sum, sum, (const ptype##_affine *)points++); \ + n /= 2; \ + for (mul_acc = NULL, i = n; i--; mul_acc = points->Z, points += 2) \ + ptype##_head(points, mul_acc); \ +\ + reciprocal_##field(points[-2].Z, points[-2].Z); /* 1/∏ Zi */ \ +\ + for (dst = points, i = n; --i;) { \ + dst--; points -= 2; \ + mul_##field(points[-2].Z, points[0].Z, points[-2].Z); \ + ptype##_tail(dst, points, points[-2].Z); \ + mul_##field(points[-2].Z, points[0].Z, points[1].Z); \ + } \ + dst--; points -= 2; \ + ptype##_tail(dst, points, points[0].Z); \ + points = dst; \ + } \ + while (n--) \ + ptype##_dadd_affine(sum, sum, (const ptype##_affine *)points++); \ +} \ +\ +void prefix##s_add(ptype *sum, const ptype##_affine *const points[], \ + size_t npoints) \ +{ \ + /* Performance with 288K scratch is within 1-2-3% from optimal */ \ + const size_t stride = sizeof(ptype)==sizeof(POINTonE1) ? 2048 : 1024; \ + ptype *scratch = alloca((npoints > stride ? stride : npoints) * \ + sizeof(ptype)); \ + const ptype##_affine *point = NULL; \ +\ + vec_zero(sum, sizeof(*sum)); \ + while (npoints) { \ + size_t i, j = npoints > stride ? stride : npoints; \ + for (i=0; i> (8 * (n % sizeof(limb_t)))); + } +} + +static inline void limbs_from_le_bytes(limb_t *restrict ret, + const unsigned char *in, size_t n) +{ + limb_t limb = 0; + + while(n--) { + limb <<= 8; + limb |= in[n]; + /* + * 'if (n % sizeof(limb_t) == 0)' is omitted because it's cheaper + * to perform redundant stores than to pay penalty for + * mispredicted branch. Besides, some compilers unroll the + * loop and remove redundant stores to 'restict'-ed storage... 
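The ptype##s_accumulate tree above pays for a single reciprocal_##field call per batch and recovers every individual 1/Z by multiplying prefix products back out, which is Montgomery's batch-inversion trick and the source of the ~5M+1S amortized per-point cost quoted earlier. A standalone sketch of that trick over a toy prime field (the fe_* names are hypothetical, chosen only to keep the example self-contained):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Toy prime field standing in for the 381-bit fp; the point is the
 * batching pattern, not the field arithmetic. */
#define TOY_P 1000003ULL

static uint64_t fe_mul(uint64_t a, uint64_t b) { return (a * b) % TOY_P; }

static uint64_t fe_inv(uint64_t a)         /* the one expensive inversion */
{
    uint64_t r = 1, e = TOY_P - 2;         /* Fermat: a^(P-2) mod P */
    while (e) {
        if (e & 1) r = fe_mul(r, a);
        a = fe_mul(a, a);
        e >>= 1;
    }
    return r;
}

/* Invert n (>0) nonzero elements in place with a single fe_inv():
 * accumulate prefix products, invert the grand total once, then peel the
 * factors off backwards.  This is the same amortization the accumulator
 * above performs on the Z denominators. */
static void batch_invert(uint64_t v[], uint64_t prefix[], size_t n)
{
    uint64_t acc;
    size_t i;

    prefix[0] = v[0];
    for (i = 1; i < n; i++)
        prefix[i] = fe_mul(prefix[i-1], v[i]);

    acc = fe_inv(prefix[n-1]);             /* 1 / (v[0]*...*v[n-1]) */

    for (i = n; --i > 0;) {
        uint64_t inv_i = fe_mul(acc, prefix[i-1]);  /* = 1/v[i]             */
        acc = fe_mul(acc, v[i]);                    /* drop v[i] from chain */
        v[i] = inv_i;
    }
    v[0] = acc;
}

int main(void)
{
    uint64_t v[4] = { 2, 3, 5, 7 }, prefix[4];
    batch_invert(v, prefix, 4);
    printf("%llu\n", (unsigned long long)fe_mul(v[0], 2));  /* prints 1 */
    return 0;
}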
+ */ + ret[n / sizeof(limb_t)] = limb; + } +} + +static inline void le_bytes_from_limbs(unsigned char *out, const limb_t *in, + size_t n) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + limb_t limb; + size_t i, j, r; + + if ((uptr_t)out == (uptr_t)in && is_endian.little) + return; + + r = n % sizeof(limb_t); + n /= sizeof(limb_t); + + for(i = 0; i < n; i++) { + for (limb = in[i], j = 0; j < sizeof(limb_t); j++, limb >>= 8) + *out++ = (unsigned char)limb; + } + if (r) { + for (limb = in[i], j = 0; j < r; j++, limb >>= 8) + *out++ = (unsigned char)limb; + } +} + +static inline char hex_from_nibble(unsigned char nibble) +{ + int mask = (9 - (nibble &= 0xf)) >> 31; + return (char)(nibble + ((('a'-10) & mask) | ('0' & ~mask))); +} + +static unsigned char nibble_from_hex(char c) +{ + int mask, ret; + + mask = (('a'-c-1) & (c-1-'f')) >> 31; + ret = (10 + c - 'a') & mask; + mask = (('A'-c-1) & (c-1-'F')) >> 31; + ret |= (10 + c - 'A') & mask; + mask = (('0'-c-1) & (c-1-'9')) >> 31; + ret |= (c - '0') & mask; + mask = ((ret-1) & ~mask) >> 31; + ret |= 16 & mask; + + return (unsigned char)ret; +} + +static void bytes_from_hexascii(unsigned char *ret, size_t sz, const char *hex) +{ + size_t len; + unsigned char b = 0; + + if (hex[0]=='0' && (hex[1]=='x' || hex[1]=='X')) + hex += 2; + + for (len = 0; len<2*sz && nibble_from_hex(hex[len])<16; len++) ; + + bytes_zero(ret, sz); + + while(len--) { + b <<= 4; + b |= nibble_from_hex(*hex++); + if (len % 2 == 0) + ret[len / 2] = b; + } +} + +static void limbs_from_hexascii(limb_t *ret, size_t sz, const char *hex) +{ + size_t len; + limb_t limb = 0; + + if (hex[0]=='0' && (hex[1]=='x' || hex[1]=='X')) + hex += 2; + + for (len = 0; len<2*sz && nibble_from_hex(hex[len])<16; len++) ; + + vec_zero(ret, sz); + + while(len--) { + limb <<= 4; + limb |= nibble_from_hex(*hex++); + if (len % (2*sizeof(limb_t)) == 0) + ret[len / (2*sizeof(limb_t))] = limb; + } +} + +#endif diff --git a/crypto/blst_src/client_min_pk.c b/crypto/blst_src/client_min_pk.c new file mode 100644 index 00000000000..0fcf563f502 --- /dev/null +++ b/crypto/blst_src/client_min_pk.c @@ -0,0 +1,17 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "keygen.c" +#include "e2.c" +#include "hash_to_field.c" +#include "map_to_g2.c" +#include "e1.c" +#include "exp.c" +#include "sqrt.c" +#include "recip.c" +#include "consts.c" +#include "vect.c" +#include "exports.c" diff --git a/crypto/blst_src/client_min_sig.c b/crypto/blst_src/client_min_sig.c new file mode 100644 index 00000000000..8e4663daede --- /dev/null +++ b/crypto/blst_src/client_min_sig.c @@ -0,0 +1,17 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "keygen.c" +#include "e1.c" +#include "hash_to_field.c" +#include "map_to_g1.c" +#include "e2.c" +#include "exp.c" +#include "sqrt.c" +#include "recip.c" +#include "consts.c" +#include "vect.c" +#include "exports.c" diff --git a/crypto/blst_src/consts.c b/crypto/blst_src/consts.c new file mode 100644 index 00000000000..021c878a258 --- /dev/null +++ b/crypto/blst_src/consts.c @@ -0,0 +1,36 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "consts.h" + +/* z = -0xd201000000010000 */ +const vec384 BLS12_381_P = { /* (z-1)^2 * (z^4 - z^2 + 1)/3 + z */ + TO_LIMB_T(0xb9feffffffffaaab), TO_LIMB_T(0x1eabfffeb153ffff), + TO_LIMB_T(0x6730d2a0f6b0f624), TO_LIMB_T(0x64774b84f38512bf), + TO_LIMB_T(0x4b1ba7b6434bacd7), TO_LIMB_T(0x1a0111ea397fe69a) +}; +const limb_t BLS12_381_p0 = (limb_t)0x89f3fffcfffcfffd; /* -1/P */ + +const radix384 BLS12_381_Rx = { /* (1<<384)%P, "radix", one-in-Montgomery */ + { { ONE_MONT_P }, + { 0 } } +}; + +const vec384 BLS12_381_RR = { /* (1<<768)%P, "radix"^2, to-Montgomery */ + TO_LIMB_T(0xf4df1f341c341746), TO_LIMB_T(0x0a76e6a609d104f1), + TO_LIMB_T(0x8de5476c4c95b6d5), TO_LIMB_T(0x67eb88a9939d83c0), + TO_LIMB_T(0x9a793e85b519952d), TO_LIMB_T(0x11988fe592cae3aa) +}; + +const vec256 BLS12_381_r = { /* z^4 - z^2 + 1, group order */ + TO_LIMB_T(0xffffffff00000001), TO_LIMB_T(0x53bda402fffe5bfe), + TO_LIMB_T(0x3339d80809a1d805), TO_LIMB_T(0x73eda753299d7d48) +}; + +const vec256 BLS12_381_rRR = { /* (1<<512)%r, "radix"^2, to-Montgomery */ + TO_LIMB_T(0xc999e990f3f29c6d), TO_LIMB_T(0x2b6cedcb87925c23), + TO_LIMB_T(0x05d314967254398f), TO_LIMB_T(0x0748d9d99f59ff11) +}; diff --git a/crypto/blst_src/consts.h b/crypto/blst_src/consts.h new file mode 100644 index 00000000000..cb391b817df --- /dev/null +++ b/crypto/blst_src/consts.h @@ -0,0 +1,30 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_CONST_H__ +#define __BLS12_381_ASM_CONST_H__ +#include "vect.h" + +extern const vec384 BLS12_381_P; +extern const limb_t BLS12_381_p0; +static const limb_t p0 = (limb_t)0x89f3fffcfffcfffd; /* -1/P */ +typedef union { vec384 p12[12]; vec384x p2; vec384 p; } radix384; +extern const radix384 BLS12_381_Rx; /* (1<<384)%P, "radix", one-in-Montgomery */ +extern const vec384 BLS12_381_RR; /* (1<<768)%P, "radix"^2, to-Montgomery */ + +#define ONE_MONT_P TO_LIMB_T(0x760900000002fffd), \ + TO_LIMB_T(0xebf4000bc40c0002), \ + TO_LIMB_T(0x5f48985753c758ba), \ + TO_LIMB_T(0x77ce585370525745), \ + TO_LIMB_T(0x5c071a97a256ec6d), \ + TO_LIMB_T(0x15f65ec3fa80e493) + +#define ZERO_384 (BLS12_381_Rx.p2[1]) + +extern const vec256 BLS12_381_r; /* order */ +static const limb_t r0 = (limb_t)0xfffffffeffffffff; /* -1/r */ +extern const vec256 BLS12_381_rRR; /* (1<<512)%r, "radix"^2, to-Montgomery */ + +#endif diff --git a/crypto/blst_src/e1.c b/crypto/blst_src/e1.c new file mode 100644 index 00000000000..91c4cdbf39c --- /dev/null +++ b/crypto/blst_src/e1.c @@ -0,0 +1,564 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
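For orientation, the standard Montgomery-arithmetic reading of the constants just defined (the comments' "radix" is R below); this is background rather than anything the patch states beyond its comments, and the limb width is assumed to be 64 bits:

\[
R = 2^{384}, \qquad \mathrm{MontMul}(a,b) = a\,b\,R^{-1} \bmod P,
\]
\[
\mathrm{MontMul}(a,\; R^{2} \bmod P) = a\,R \bmod P \quad\text{(into Montgomery form, via BLS12\_381\_RR)},
\]
\[
\mathrm{MontMul}(\tilde a,\; 1) = \tilde a\,R^{-1} \bmod P \quad\text{(back to canonical form)},
\]
\[
P\cdot\texttt{BLS12\_381\_p0} \equiv -1 \pmod{2^{64}},
\]

so BLS12_381_Rx holds R mod P (the value 1 in Montgomery form) and BLS12_381_rRR plays the same to-Montgomery role modulo the group order r.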
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" +#include "errors.h" + +/* + * y^2 = x^3 + B + */ +static const vec384 B_E1 = { /* (4 << 384) % P */ + TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), + TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), + TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) +}; + +const POINTonE1 BLS12_381_G1 = { /* generator point [in Montgomery] */ + /* (0x17f1d3a73197d7942695638c4fa9ac0fc3688c4f9774b905 + * a14e3a3f171bac586c55e83ff97a1aeffb3af00adb22c6bb << 384) % P */ + { TO_LIMB_T(0x5cb38790fd530c16), TO_LIMB_T(0x7817fc679976fff5), + TO_LIMB_T(0x154f95c7143ba1c1), TO_LIMB_T(0xf0ae6acdf3d0e747), + TO_LIMB_T(0xedce6ecc21dbf440), TO_LIMB_T(0x120177419e0bfb75) }, + /* (0x08b3f481e3aaa0f1a09e30ed741d8ae4fcf5e095d5d00af6 + * 00db18cb2c04b3edd03cc744a2888ae40caa232946c5e7e1 << 384) % P */ + { TO_LIMB_T(0xbaac93d50ce72271), TO_LIMB_T(0x8c22631a7918fd8e), + TO_LIMB_T(0xdd595f13570725ce), TO_LIMB_T(0x51ac582950405194), + TO_LIMB_T(0x0e1c8c3fad0059c0), TO_LIMB_T(0x0bbc3efc5008a26a) }, + { ONE_MONT_P } +}; + +const POINTonE1 BLS12_381_NEG_G1 = { /* negative generator [in Montgomery] */ + /* (0x17f1d3a73197d7942695638c4fa9ac0fc3688c4f9774b905 + * a14e3a3f171bac586c55e83ff97a1aeffb3af00adb22c6bb << 384) % P */ + { TO_LIMB_T(0x5cb38790fd530c16), TO_LIMB_T(0x7817fc679976fff5), + TO_LIMB_T(0x154f95c7143ba1c1), TO_LIMB_T(0xf0ae6acdf3d0e747), + TO_LIMB_T(0xedce6ecc21dbf440), TO_LIMB_T(0x120177419e0bfb75) }, + /* (0x114d1d6855d545a8aa7d76c8cf2e21f267816aef1db507c9 + * 6655b9d5caac42364e6f38ba0ecb751bad54dcd6b939c2ca << 384) % P */ + { TO_LIMB_T(0xff526c2af318883a), TO_LIMB_T(0x92899ce4383b0270), + TO_LIMB_T(0x89d7738d9fa9d055), TO_LIMB_T(0x12caf35ba344c12a), + TO_LIMB_T(0x3cff1b76964b5317), TO_LIMB_T(0x0e44d2ede9774430) }, + { ONE_MONT_P } +}; + +static inline void mul_by_b_onE1(vec384 out, const vec384 in) +{ lshift_fp(out, in, 2); } + +static inline void mul_by_4b_onE1(vec384 out, const vec384 in) +{ lshift_fp(out, in, 4); } + +static void POINTonE1_cneg(POINTonE1 *p, bool_t cbit) +{ cneg_fp(p->Y, p->Y, cbit); } + +void blst_p1_cneg(POINTonE1 *a, int cbit) +{ POINTonE1_cneg(a, is_zero(cbit) ^ 1); } + +static void POINTonE1_from_Jacobian(POINTonE1 *out, const POINTonE1 *in) +{ + vec384 Z, ZZ; + limb_t inf = vec_is_zero(in->Z, sizeof(in->Z)); + + reciprocal_fp(Z, in->Z); /* 1/Z */ + + sqr_fp(ZZ, Z); + mul_fp(out->X, in->X, ZZ); /* X = X/Z^2 */ + + mul_fp(ZZ, ZZ, Z); + mul_fp(out->Y, in->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, in->Z, BLS12_381_G1.Z, + sizeof(BLS12_381_G1.Z), inf); /* Z = inf ? 
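For orientation, the Jacobian-coordinate convention used by POINTonE1_from_Jacobian above and by the on-curve tests below: a triple (X : Y : Z) with Z nonzero stands for the affine point (X/Z², Y/Z³), and Z = 0 encodes the point at infinity, so clearing denominators in y² = x³ + b gives the projective test

\[
(X:Y:Z)\ \longmapsto\ \Bigl(\frac{X}{Z^{2}},\ \frac{Y}{Z^{3}}\Bigr),
\qquad
Y^{2} = X^{3} + b\,Z^{6},
\]

with b = 4 kept in Montgomery form as B_E1 (and b = 4 + 4i as B_E2 on the twist).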
0 : 1 */ +} + +void blst_p1_from_jacobian(POINTonE1 *out, const POINTonE1 *a) +{ POINTonE1_from_Jacobian(out, a); } + +static void POINTonE1_to_affine(POINTonE1_affine *out, const POINTonE1 *in) +{ + POINTonE1 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { + POINTonE1_from_Jacobian(&p, in); + in = &p; + } + vec_copy(out, in, sizeof(*out)); +} + +void blst_p1_to_affine(POINTonE1_affine *out, const POINTonE1 *a) +{ POINTonE1_to_affine(out, a); } + +void blst_p1_from_affine(POINTonE1 *out, const POINTonE1_affine *a) +{ + vec_copy(out, a, sizeof(*a)); + vec_select(out->Z, a->X, BLS12_381_Rx.p, sizeof(out->Z), + vec_is_zero(a, sizeof(*a))); +} + +static bool_t POINTonE1_affine_on_curve(const POINTonE1_affine *p) +{ + vec384 XXX, YY; + + sqr_fp(XXX, p->X); + mul_fp(XXX, XXX, p->X); /* X^3 */ + add_fp(XXX, XXX, B_E1); /* X^3 + B */ + + sqr_fp(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)); +} + +int blst_p1_affine_on_curve(const POINTonE1_affine *p) +{ return (int)(POINTonE1_affine_on_curve(p) | vec_is_zero(p, sizeof(*p))); } + +static bool_t POINTonE1_on_curve(const POINTonE1 *p) +{ + vec384 XXX, YY, BZ6; + limb_t inf = vec_is_zero(p->Z, sizeof(p->Z)); + + sqr_fp(BZ6, p->Z); + mul_fp(BZ6, BZ6, p->Z); + sqr_fp(BZ6, BZ6); /* Z^6 */ + mul_by_b_onE1(BZ6, BZ6); /* B*Z^6 */ + + sqr_fp(XXX, p->X); + mul_fp(XXX, XXX, p->X); /* X^3 */ + add_fp(XXX, XXX, BZ6); /* X^3 + B*Z^6 */ + + sqr_fp(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)) | inf; +} + +int blst_p1_on_curve(const POINTonE1 *p) +{ return (int)POINTonE1_on_curve(p); } + +static limb_t POINTonE1_affine_Serialize_BE(unsigned char out[96], + const POINTonE1_affine *in) +{ + vec384 temp; + + from_fp(temp, in->X); + be_bytes_from_limbs(out, temp, sizeof(temp)); + + from_fp(temp, in->Y); + be_bytes_from_limbs(out + 48, temp, sizeof(temp)); + + return sgn0_pty_mod_384(temp, BLS12_381_P); +} + +void blst_p1_affine_serialize(unsigned char out[96], + const POINTonE1_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + bytes_zero(out, 96); + out[0] = 0x40; /* infinitiy bit */ + } else { + (void)POINTonE1_affine_Serialize_BE(out, in); + } +} + +static limb_t POINTonE1_Serialize_BE(unsigned char out[96], + const POINTonE1 *in) +{ + POINTonE1 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { + POINTonE1_from_Jacobian(&p, in); + in = &p; + } + + return POINTonE1_affine_Serialize_BE(out, (const POINTonE1_affine *)in); +} + +static void POINTonE1_Serialize(unsigned char out[96], const POINTonE1 *in) +{ + if (vec_is_zero(in->Z, sizeof(in->Z))) { + bytes_zero(out, 96); + out[0] = 0x40; /* infinitiy bit */ + } else { + (void)POINTonE1_Serialize_BE(out, in); + } +} + +void blst_p1_serialize(unsigned char out[96], const POINTonE1 *in) +{ POINTonE1_Serialize(out, in); } + +static limb_t POINTonE1_affine_Compress_BE(unsigned char out[48], + const POINTonE1_affine *in) +{ + vec384 temp; + + from_fp(temp, in->X); + be_bytes_from_limbs(out, temp, sizeof(temp)); + + return sgn0_pty_mont_384(in->Y, BLS12_381_P, p0); +} + +void blst_p1_affine_compress(unsigned char out[48], const POINTonE1_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + bytes_zero(out, 48); + out[0] = 0xc0; /* compressed and infinitiy bits */ + } else { + limb_t sign = POINTonE1_affine_Compress_BE(out, in); + out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); + } +} + +static limb_t POINTonE1_Compress_BE(unsigned char out[48], + const POINTonE1 *in) +{ + POINTonE1 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, 
sizeof(in->Z))) { + POINTonE1_from_Jacobian(&p, in); + in = &p; + } + + return POINTonE1_affine_Compress_BE(out, (const POINTonE1_affine *)in); +} + +void blst_p1_compress(unsigned char out[48], const POINTonE1 *in) +{ + if (vec_is_zero(in->Z, sizeof(in->Z))) { + bytes_zero(out, 48); + out[0] = 0xc0; /* compressed and infinitiy bits */ + } else { + limb_t sign = POINTonE1_Compress_BE(out, in); + out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); + } +} + +static limb_t POINTonE1_Uncompress_BE(POINTonE1_affine *out, + const unsigned char in[48]) +{ + POINTonE1_affine ret; + vec384 temp; + + limbs_from_be_bytes(ret.X, in, sizeof(ret.X)); + /* clear top 3 bits in case caller was conveying some information there */ + ret.X[sizeof(ret.X)/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; + add_fp(temp, ret.X, ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X, sizeof(temp))) + return (limb_t)0 - BLST_BAD_ENCODING; + mul_fp(ret.X, ret.X, BLS12_381_RR); + + sqr_fp(ret.Y, ret.X); + mul_fp(ret.Y, ret.Y, ret.X); + add_fp(ret.Y, ret.Y, B_E1); /* X^3 + B */ + if (!sqrt_fp(ret.Y, ret.Y)) + return (limb_t)0 - BLST_POINT_NOT_ON_CURVE; + + vec_copy(out, &ret, sizeof(ret)); + + return sgn0_pty_mont_384(out->Y, BLS12_381_P, p0); +} + +static BLST_ERROR POINTonE1_Uncompress_Z(POINTonE1_affine *out, + const unsigned char in[48]) +{ + unsigned char in0 = in[0]; + limb_t sgn0_pty; + + if ((in0 & 0x80) == 0) /* compressed bit */ + return BLST_BAD_ENCODING; + + if (in0 & 0x40) { /* infinity bit */ + if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 47)) { + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } else { + return BLST_BAD_ENCODING; + } + } + + sgn0_pty = POINTonE1_Uncompress_BE(out, in); + + if (sgn0_pty > 3) + return (BLST_ERROR)(0 - sgn0_pty); /* POINT_NOT_ON_CURVE */ + + sgn0_pty >>= 1; /* skip over parity bit */ + sgn0_pty ^= (in0 & 0x20) >> 5; + cneg_fp(out->Y, out->Y, sgn0_pty); + + /* (0,±2) is not in group, but application might want to ignore? */ + return vec_is_zero(out->X, sizeof(out->X)) ? BLST_POINT_NOT_IN_GROUP + : BLST_SUCCESS; +} + +BLST_ERROR blst_p1_uncompress(POINTonE1_affine *out, const unsigned char in[48]) +{ return POINTonE1_Uncompress_Z(out, in); } + +static BLST_ERROR POINTonE1_Deserialize_BE(POINTonE1_affine *out, + const unsigned char in[96]) +{ + POINTonE1_affine ret; + vec384 temp; + + limbs_from_be_bytes(ret.X, in, sizeof(ret.X)); + limbs_from_be_bytes(ret.Y, in + 48, sizeof(ret.Y)); + + /* clear top 3 bits in case caller was conveying some information there */ + ret.X[sizeof(ret.X)/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; + add_fp(temp, ret.X, ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X, sizeof(temp))) + return BLST_BAD_ENCODING; + + add_fp(temp, ret.Y, ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.Y, sizeof(temp))) + return BLST_BAD_ENCODING; + + mul_fp(ret.X, ret.X, BLS12_381_RR); + mul_fp(ret.Y, ret.Y, BLS12_381_RR); + + if (!POINTonE1_affine_on_curve(&ret)) + return BLST_POINT_NOT_ON_CURVE; + + vec_copy(out, &ret, sizeof(ret)); + + /* (0,±2) is not in group, but application might want to ignore? */ + return vec_is_zero(out->X, sizeof(out->X)) ? 
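A usage sketch for the decoder just defined, under the assumption that the library's public header blst.h (not part of this hunk) exposes it as usual through the blst_p1_affine alias and also provides blst_p1_affine_in_g1. Uncompression validates the encoding and the curve equation, but subgroup membership is a separate concern, as the POINT_NOT_IN_GROUP remark above hints:

#include "blst.h"   /* public API header, provided elsewhere in the library */

/* Illustration only: decode a 48-byte compressed G1 point (e.g. a BLS
 * public key) and additionally check membership in the r-order subgroup. */
int load_g1_pubkey(blst_p1_affine *pk, const unsigned char in[48])
{
    if (blst_p1_uncompress(pk, in) != BLST_SUCCESS)
        return 0;                            /* bad encoding or off curve */
    return (int)blst_p1_affine_in_g1(pk);    /* separate subgroup check   */
}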
BLST_POINT_NOT_IN_GROUP + : BLST_SUCCESS; +} + +static BLST_ERROR POINTonE1_Deserialize_Z(POINTonE1_affine *out, + const unsigned char in[96]) +{ + unsigned char in0 = in[0]; + + if ((in0 & 0xe0) == 0) + return POINTonE1_Deserialize_BE(out, in); + + if (in0 & 0x80) /* compressed bit */ + return POINTonE1_Uncompress_Z(out, in); + + if (in0 & 0x40) { /* infinity bit */ + if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 95)) { + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + } + + return BLST_BAD_ENCODING; +} + +BLST_ERROR blst_p1_deserialize(POINTonE1_affine *out, + const unsigned char in[96]) +{ return POINTonE1_Deserialize_Z(out, in); } + +#include "ec_ops.h" +POINT_DADD_IMPL(POINTonE1, 384, fp) +POINT_DADD_AFFINE_IMPL_A0(POINTonE1, 384, fp, BLS12_381_Rx.p) +POINT_ADD_IMPL(POINTonE1, 384, fp) +POINT_ADD_AFFINE_IMPL(POINTonE1, 384, fp, BLS12_381_Rx.p) +POINT_DOUBLE_IMPL_A0(POINTonE1, 384, fp) +POINT_IS_EQUAL_IMPL(POINTonE1, 384, fp) + +void blst_p1_add(POINTonE1 *out, const POINTonE1 *a, const POINTonE1 *b) +{ POINTonE1_add(out, a, b); } + +void blst_p1_add_or_double(POINTonE1 *out, const POINTonE1 *a, + const POINTonE1 *b) +{ POINTonE1_dadd(out, a, b, NULL); } + +void blst_p1_add_affine(POINTonE1 *out, const POINTonE1 *a, + const POINTonE1_affine *b) +{ POINTonE1_add_affine(out, a, b); } + +void blst_p1_add_or_double_affine(POINTonE1 *out, const POINTonE1 *a, + const POINTonE1_affine *b) +{ POINTonE1_dadd_affine(out, a, b); } + +void blst_p1_double(POINTonE1 *out, const POINTonE1 *a) +{ POINTonE1_double(out, a); } + +int blst_p1_is_equal(const POINTonE1 *a, const POINTonE1 *b) +{ return (int)POINTonE1_is_equal(a, b); } + +#include "ec_mult.h" +POINT_MULT_SCALAR_WX_IMPL(POINTonE1, 4) +POINT_MULT_SCALAR_WX_IMPL(POINTonE1, 5) + +#ifdef __BLST_PRIVATE_TESTMODE__ +POINT_AFFINE_MULT_SCALAR_IMPL(POINTonE1) + +DECLARE_PRIVATE_POINTXZ(POINTonE1, 384) +POINT_LADDER_PRE_IMPL(POINTonE1, 384, fp) +POINT_LADDER_STEP_IMPL_A0(POINTonE1, 384, fp, onE1) +POINT_LADDER_POST_IMPL_A0(POINTonE1, 384, fp, onE1) +POINT_MULT_SCALAR_LADDER_IMPL(POINTonE1) +#endif + +static const vec384 beta = { /* such that beta^3 - 1 = 0 */ + /* -1/2 * (1 + sqrt(-3)) = ((P-2)^(P-2)) * (1 + (P-3)^((P+1)/4)) */ + /* (0x1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4 + 897d29650fb85f9b409427eb4f49fffd8bfd00000000aaac << 384) % P */ + TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), + TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), + TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) +}; + +static void sigma(POINTonE1 *out, const POINTonE1 *in) +{ + vec_copy(out->X, in->X, 2*sizeof(out->X)); + mul_fp(out->Z, in->Z, beta); +} + +/* Gallant-Lambert-Vanstone, ~45% faster than POINTonE1_mult_w5 */ +static void POINTonE1_mult_glv(POINTonE1 *out, const POINTonE1 *in, + const pow256 SK) +{ + union { vec256 l; pow256 s; } val; + + /* SK/z^2 [in constant time] */ + + limbs_from_le_bytes(val.l, SK, 32); + div_by_zz(val.l); + le_bytes_from_limbs(val.s, val.l, 32); + + { + const byte *scalars[2] = { val.s+16, val.s }; + POINTonE1 table[2][1<<(5-1)]; /* 4.5KB */ + size_t i; + + POINTonE1_precompute_w5(table[0], in); + for (i = 0; i < 1<<(5-1); i++) { + mul_fp(table[1][i].X, table[0][i].X, beta); + cneg_fp(table[1][i].Y, table[0][i].Y, 1); + vec_copy(table[1][i].Z, table[0][i].Z, sizeof(table[1][i].Z)); + } + + POINTonE1s_mult_w5(out, NULL, 2, scalars, 128, table); + POINTonE1_cneg(out, 1); + mul_fp(out->Z, out->Z, beta); + mul_fp(out->Z, out->Z, beta); + } + + vec_zero(val.l, sizeof(val)); /* 
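Background on the GLV trick that the "~45% faster" remark refers to (standard theory, not stated by the patch beyond its comments): beta is a nontrivial cube root of unity in F_P, so sigma(x, y) = (beta*x, y) is an endomorphism of the curve that acts on the order-r subgroup as multiplication by an eigenvalue lambda satisfying lambda² + lambda + 1 ≡ 0 (mod r). The div_by_zz step splits the secret scalar into two roughly 128-bit halves with respect to that eigenvalue, and the two half-length multiplications then share one doubling chain:

\[
\sigma(x,y) = (\beta x,\ y) = [\lambda](x,y),\qquad
\beta^{3}\equiv 1 \pmod{P},\qquad
\lambda^{2}+\lambda+1\equiv 0 \pmod{r},
\]
\[
[k]P = [k_{0}]P + [k_{1}]\,\sigma(P),\qquad
k \equiv k_{0}+k_{1}\lambda \pmod{r},\qquad
|k_{0}|,|k_{1}| \approx \sqrt{r}.
\]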
scrub the copy of SK */ +} + +static void POINTonE1_sign(POINTonE1 *out, const POINTonE1 *in, const pow256 SK) +{ + vec384 Z, ZZ; + limb_t inf; + + POINTonE1_mult_glv(out, in, SK); + + /* convert to affine to remove possible bias in out->Z */ + inf = vec_is_zero(out->Z, sizeof(out->Z)); +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + flt_reciprocal_fp(Z, out->Z); /* 1/Z */ +#else + reciprocal_fp(Z, out->Z); /* 1/Z */ +#endif + + sqr_fp(ZZ, Z); + mul_fp(out->X, out->X, ZZ); /* X = X/Z^2 */ + + mul_fp(ZZ, ZZ, Z); + mul_fp(out->Y, out->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, out->Z, BLS12_381_G1.Z, sizeof(BLS12_381_G1.Z), + inf); /* Z = inf ? 0 : 1 */ +} + +void blst_sk_to_pk_in_g1(POINTonE1 *out, const pow256 SK) +{ POINTonE1_sign(out, &BLS12_381_G1, SK); } + +void blst_sign_pk_in_g2(POINTonE1 *out, const POINTonE1 *msg, const pow256 SK) +{ POINTonE1_sign(out, msg, SK); } + +void blst_sk_to_pk2_in_g1(unsigned char out[96], POINTonE1_affine *PK, + const pow256 SK) +{ + POINTonE1 P[1]; + + POINTonE1_sign(P, &BLS12_381_G1, SK); + if (PK != NULL) + vec_copy(PK, P, sizeof(*PK)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE1_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_sign_pk2_in_g2(unsigned char out[96], POINTonE1_affine *sig, + const POINTonE1 *hash, const pow256 SK) +{ + POINTonE1 P[1]; + + POINTonE1_sign(P, hash, SK); + if (sig != NULL) + vec_copy(sig, P, sizeof(*sig)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE1_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_p1_mult(POINTonE1 *out, const POINTonE1 *a, + const byte *scalar, size_t nbits) +{ + if (nbits < 176) { + if (nbits) + POINTonE1_mult_w4(out, a, scalar, nbits); + else + vec_zero(out, sizeof(*out)); + } else if (nbits <= 256) { + union { vec256 l; pow256 s; } val; + size_t i, j, top, mask = (size_t)0 - 1; + + /* this is not about constant-time-ness, but branch optimization */ + for (top = (nbits + 7)/8, i=0, j=0; i> (8*sizeof(top)-1)); + j += 1 & mask; + } + + if (check_mod_256(val.s, BLS12_381_r)) /* z^4 is the formal limit */ + POINTonE1_mult_glv(out, a, val.s); + else /* should never be the case, added for formal completeness */ + POINTonE1_mult_w5(out, a, scalar, nbits); + + vec_zero(val.l, sizeof(val)); + } else { /* should never be the case, added for formal completeness */ + POINTonE1_mult_w5(out, a, scalar, nbits); + } +} + +void blst_p1_unchecked_mult(POINTonE1 *out, const POINTonE1 *a, + const byte *scalar, size_t nbits) +{ + if (nbits) + POINTonE1_mult_w4(out, a, scalar, nbits); + else + vec_zero(out, sizeof(*out)); +} + +int blst_p1_affine_is_equal(const POINTonE1_affine *a, + const POINTonE1_affine *b) +{ return (int)vec_is_equal(a, b, sizeof(*a)); } + +int blst_p1_is_inf(const POINTonE1 *p) +{ return (int)vec_is_zero(p->Z, sizeof(p->Z)); } + +const POINTonE1 *blst_p1_generator(void) +{ return &BLS12_381_G1; } + +int blst_p1_affine_is_inf(const POINTonE1_affine *p) +{ return (int)vec_is_zero(p, sizeof(*p)); } + +const POINTonE1_affine *blst_p1_affine_generator(void) +{ return (const POINTonE1_affine *)&BLS12_381_G1; } + +size_t blst_p1_sizeof(void) +{ return sizeof(POINTonE1); } + +size_t blst_p1_affine_sizeof(void) +{ return sizeof(POINTonE1_affine); } diff --git a/crypto/blst_src/e2.c b/crypto/blst_src/e2.c new file mode 100644 index 00000000000..822ac23c694 --- /dev/null +++ b/crypto/blst_src/e2.c @@ -0,0 
+1,638 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" +#include "errors.h" + +/* + * y^2 = x^3 + B + */ +static const vec384x B_E2 = { /* 4 + 4*i */ + { TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), + TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), + TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) }, + { TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), + TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), + TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) } +}; + +const POINTonE2 BLS12_381_G2 = { /* generator point [in Montgomery] */ +{ /* (0x024aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02 + b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb8 << 384) % P */ + { TO_LIMB_T(0xf5f28fa202940a10), TO_LIMB_T(0xb3f5fb2687b4961a), + TO_LIMB_T(0xa1a893b53e2ae580), TO_LIMB_T(0x9894999d1a3caee9), + TO_LIMB_T(0x6f67b7631863366b), TO_LIMB_T(0x058191924350bcd7) }, + /* (0x13e02b6052719f607dacd3a088274f65596bd0d09920b61a + b5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e << 384) % P */ + { TO_LIMB_T(0xa5a9c0759e23f606), TO_LIMB_T(0xaaa0c59dbccd60c3), + TO_LIMB_T(0x3bb17e18e2867806), TO_LIMB_T(0x1b1ab6cc8541b367), + TO_LIMB_T(0xc2b6ed0ef2158547), TO_LIMB_T(0x11922a097360edf3) } +}, +{ /* (0x0ce5d527727d6e118cc9cdc6da2e351aadfd9baa8cbdd3a7 + 6d429a695160d12c923ac9cc3baca289e193548608b82801 << 384) % P */ + { TO_LIMB_T(0x4c730af860494c4a), TO_LIMB_T(0x597cfa1f5e369c5a), + TO_LIMB_T(0xe7e6856caa0a635a), TO_LIMB_T(0xbbefb5e96e0d495f), + TO_LIMB_T(0x07d3a975f0ef25a2), TO_LIMB_T(0x0083fd8e7e80dae5) }, + /* (0x0606c4a02ea734cc32acd2b02bc28b99cb3e287e85a763af + 267492ab572e99ab3f370d275cec1da1aaa9075ff05f79be << 384) % P */ + { TO_LIMB_T(0xadc0fc92df64b05d), TO_LIMB_T(0x18aa270a2b1461dc), + TO_LIMB_T(0x86adac6a3be4eba0), TO_LIMB_T(0x79495c4ec93da33a), + TO_LIMB_T(0xe7175850a43ccaed), TO_LIMB_T(0x0b2bc2a163de1bf2) }, +}, +{ { ONE_MONT_P }, { 0 } } +}; + +const POINTonE2 BLS12_381_NEG_G2 = { /* negative generator [in Montgomery] */ +{ /* (0x024aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02 + b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb8 << 384) % P */ + { TO_LIMB_T(0xf5f28fa202940a10), TO_LIMB_T(0xb3f5fb2687b4961a), + TO_LIMB_T(0xa1a893b53e2ae580), TO_LIMB_T(0x9894999d1a3caee9), + TO_LIMB_T(0x6f67b7631863366b), TO_LIMB_T(0x058191924350bcd7) }, + /* (0x13e02b6052719f607dacd3a088274f65596bd0d09920b61a + b5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e << 384) % P */ + { TO_LIMB_T(0xa5a9c0759e23f606), TO_LIMB_T(0xaaa0c59dbccd60c3), + TO_LIMB_T(0x3bb17e18e2867806), TO_LIMB_T(0x1b1ab6cc8541b367), + TO_LIMB_T(0xc2b6ed0ef2158547), TO_LIMB_T(0x11922a097360edf3) } +}, +{ /* (0x0d1b3cc2c7027888be51d9ef691d77bcb679afda66c73f17 + f9ee3837a55024f78c71363275a75d75d86bab79f74782aa << 384) % P */ + { TO_LIMB_T(0x6d8bf5079fb65e61), TO_LIMB_T(0xc52f05df531d63a5), + TO_LIMB_T(0x7f4a4d344ca692c9), TO_LIMB_T(0xa887959b8577c95f), + TO_LIMB_T(0x4347fe40525c8734), TO_LIMB_T(0x197d145bbaff0bb5) }, + /* (0x13fa4d4a0ad8b1ce186ed5061789213d993923066dddaf10 + 40bc3ff59f825c78df74f2d75467e25e0f55f8a00fa030ed << 384) % P */ + { TO_LIMB_T(0x0c3e036d209afa4e), TO_LIMB_T(0x0601d8f4863f9e23), + TO_LIMB_T(0xe0832636bacc0a84), TO_LIMB_T(0xeb2def362a476f84), + TO_LIMB_T(0x64044f659f0ee1e9), TO_LIMB_T(0x0ed54f48d5a1caa7) } +}, +{ { ONE_MONT_P }, { 0 } } +}; + +static void mul_by_b_onE2(vec384x out, const vec384x in) +{ + 
sub_fp(out[0], in[0], in[1]); + add_fp(out[1], in[0], in[1]); + lshift_fp(out[0], out[0], 2); + lshift_fp(out[1], out[1], 2); +} + +static void mul_by_4b_onE2(vec384x out, const vec384x in) +{ + sub_fp(out[0], in[0], in[1]); + add_fp(out[1], in[0], in[1]); + lshift_fp(out[0], out[0], 4); + lshift_fp(out[1], out[1], 4); +} + +static void POINTonE2_cneg(POINTonE2 *p, bool_t cbit) +{ cneg_fp2(p->Y, p->Y, cbit); } + +void blst_p2_cneg(POINTonE2 *a, int cbit) +{ POINTonE2_cneg(a, is_zero(cbit) ^ 1); } + +static void POINTonE2_from_Jacobian(POINTonE2 *out, const POINTonE2 *in) +{ + vec384x Z, ZZ; + limb_t inf = vec_is_zero(in->Z, sizeof(in->Z)); + + reciprocal_fp2(Z, in->Z); /* 1/Z */ + + sqr_fp2(ZZ, Z); + mul_fp2(out->X, in->X, ZZ); /* X = X/Z^2 */ + + mul_fp2(ZZ, ZZ, Z); + mul_fp2(out->Y, in->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, in->Z, BLS12_381_G2.Z, + sizeof(BLS12_381_G2.Z), inf); /* Z = inf ? 0 : 1 */ +} + +void blst_p2_from_jacobian(POINTonE2 *out, const POINTonE2 *a) +{ POINTonE2_from_Jacobian(out, a); } + +static void POINTonE2_to_affine(POINTonE2_affine *out, const POINTonE2 *in) +{ + POINTonE2 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p2, sizeof(in->Z))) { + POINTonE2_from_Jacobian(&p, in); + in = &p; + } + vec_copy(out, in, sizeof(*out)); +} + +void blst_p2_to_affine(POINTonE2_affine *out, const POINTonE2 *a) +{ POINTonE2_to_affine(out, a); } + +void blst_p2_from_affine(POINTonE2 *out, const POINTonE2_affine *a) +{ + vec_copy(out, a, sizeof(*a)); + vec_select(out->Z, a->X, BLS12_381_Rx.p2, sizeof(out->Z), + vec_is_zero(a, sizeof(*a))); +} + +static bool_t POINTonE2_affine_on_curve(const POINTonE2_affine *p) +{ + vec384x XXX, YY; + + sqr_fp2(XXX, p->X); + mul_fp2(XXX, XXX, p->X); /* X^3 */ + add_fp2(XXX, XXX, B_E2); /* X^3 + B */ + + sqr_fp2(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)); +} + +int blst_p2_affine_on_curve(const POINTonE2_affine *p) +{ return (int)(POINTonE2_affine_on_curve(p) | vec_is_zero(p, sizeof(*p))); } + +static bool_t POINTonE2_on_curve(const POINTonE2 *p) +{ + vec384x XXX, YY, BZ6; + limb_t inf = vec_is_zero(p->Z, sizeof(p->Z)); + + sqr_fp2(BZ6, p->Z); + mul_fp2(BZ6, BZ6, p->Z); + sqr_fp2(XXX, BZ6); /* Z^6 */ + mul_by_b_onE2(BZ6, XXX); /* B*Z^6 */ + + sqr_fp2(XXX, p->X); + mul_fp2(XXX, XXX, p->X); /* X^3 */ + add_fp2(XXX, XXX, BZ6); /* X^3 + B*Z^6 */ + + sqr_fp2(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)) | inf; +} + +int blst_p2_on_curve(const POINTonE2 *p) +{ return (int)POINTonE2_on_curve(p); } + +static limb_t POINTonE2_affine_Serialize_BE(unsigned char out[192], + const POINTonE2_affine *in) +{ + vec384x temp; + + from_fp(temp[1], in->X[1]); + be_bytes_from_limbs(out, temp[1], sizeof(temp[1])); + from_fp(temp[0], in->X[0]); + be_bytes_from_limbs(out + 48, temp[0], sizeof(temp[0])); + + from_fp(temp[1], in->Y[1]); + be_bytes_from_limbs(out + 96, temp[1], sizeof(temp[1])); + from_fp(temp[0], in->Y[0]); + be_bytes_from_limbs(out + 144, temp[0], sizeof(temp[0])); + + return sgn0_pty_mod_384x(temp, BLS12_381_P); +} + +void blst_p2_affine_serialize(unsigned char out[192], + const POINTonE2_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + bytes_zero(out, 192); + out[0] = 0x40; /* infinitiy bit */ + } else { + (void)POINTonE2_affine_Serialize_BE(out, in); + } +} + +static limb_t POINTonE2_Serialize_BE(unsigned char out[192], + const POINTonE2 *in) +{ + POINTonE2 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p2, sizeof(in->Z))) { + POINTonE2_from_Jacobian(&p, in); + in = &p; + } + + return 
POINTonE2_affine_Serialize_BE(out, (const POINTonE2_affine *)in); +} + +static void POINTonE2_Serialize(unsigned char out[192], const POINTonE2 *in) +{ + if (vec_is_zero(in->Z, sizeof(in->Z))) { + bytes_zero(out, 192); + out[0] = 0x40; /* infinitiy bit */ + } else { + (void)POINTonE2_Serialize_BE(out, in); + } +} + +void blst_p2_serialize(unsigned char out[192], const POINTonE2 *in) +{ POINTonE2_Serialize(out, in); } + +static limb_t POINTonE2_affine_Compress_BE(unsigned char out[96], + const POINTonE2_affine *in) +{ + vec384 temp; + + from_fp(temp, in->X[1]); + be_bytes_from_limbs(out, temp, sizeof(temp)); + from_fp(temp, in->X[0]); + be_bytes_from_limbs(out + 48, temp, sizeof(temp)); + + return sgn0_pty_mont_384x(in->Y, BLS12_381_P, p0); +} + +void blst_p2_affine_compress(unsigned char out[96], const POINTonE2_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + bytes_zero(out, 96); + out[0] = 0xc0; /* compressed and infinitiy bits */ + } else { + limb_t sign = POINTonE2_affine_Compress_BE(out, in); + out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); + } +} + +static limb_t POINTonE2_Compress_BE(unsigned char out[96], + const POINTonE2 *in) +{ + POINTonE2 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { + POINTonE2_from_Jacobian(&p, in); + in = &p; + } + + return POINTonE2_affine_Compress_BE(out, (const POINTonE2_affine *)in); +} + +void blst_p2_compress(unsigned char out[96], const POINTonE2 *in) +{ + if (vec_is_zero(in->Z, sizeof(in->Z))) { + bytes_zero(out, 96); + out[0] = 0xc0; /* compressed and infinitiy bits */ + } else { + limb_t sign = POINTonE2_Compress_BE(out, in); + out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); + } +} + +static limb_t POINTonE2_Uncompress_BE(POINTonE2_affine *out, + const unsigned char in[96]) +{ + POINTonE2_affine ret; + vec384 temp; + + limbs_from_be_bytes(ret.X[1], in, sizeof(ret.X[1])); + limbs_from_be_bytes(ret.X[0], in + 48, sizeof(ret.X[0])); + + /* clear top 3 bits in case caller was conveying some information there */ + ret.X[1][sizeof(ret.X[1])/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; + add_fp(temp, ret.X[1], ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X[1], sizeof(temp))) + return (limb_t)0 - BLST_BAD_ENCODING; + + add_fp(temp, ret.X[0], ZERO_384); /* less than modulus? 
*/ + if (!vec_is_equal(temp, ret.X[0], sizeof(temp))) + return (limb_t)0 - BLST_BAD_ENCODING; + + mul_fp(ret.X[0], ret.X[0], BLS12_381_RR); + mul_fp(ret.X[1], ret.X[1], BLS12_381_RR); + + sqr_fp2(ret.Y, ret.X); + mul_fp2(ret.Y, ret.Y, ret.X); + add_fp2(ret.Y, ret.Y, B_E2); /* X^3 + B */ + if (!sqrt_fp2(ret.Y, ret.Y)) + return (limb_t)0 - BLST_POINT_NOT_ON_CURVE; + + vec_copy(out, &ret, sizeof(ret)); + + return sgn0_pty_mont_384x(out->Y, BLS12_381_P, p0); +} + +static BLST_ERROR POINTonE2_Uncompress_Z(POINTonE2_affine *out, + const unsigned char in[96]) +{ + unsigned char in0 = in[0]; + limb_t sgn0_pty; + + if ((in0 & 0x80) == 0) /* compressed bit */ + return BLST_BAD_ENCODING; + + if (in0 & 0x40) { /* infinity bit */ + if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 95)) { + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } else { + return BLST_BAD_ENCODING; + } + } + + sgn0_pty = POINTonE2_Uncompress_BE(out, in); + + if (sgn0_pty > 3) + return (BLST_ERROR)(0 - sgn0_pty); /* POINT_NOT_ON_CURVE */ + + sgn0_pty >>= 1; /* skip over parity bit */ + sgn0_pty ^= (in0 & 0x20) >> 5; + cneg_fp2(out->Y, out->Y, sgn0_pty); + + return BLST_SUCCESS; +} + +BLST_ERROR blst_p2_uncompress(POINTonE2_affine *out, const unsigned char in[96]) +{ return POINTonE2_Uncompress_Z(out, in); } + +static BLST_ERROR POINTonE2_Deserialize_BE(POINTonE2_affine *out, + const unsigned char in[192]) +{ + POINTonE2_affine ret; + vec384 temp; + + limbs_from_be_bytes(ret.X[1], in, sizeof(ret.X[1])); + limbs_from_be_bytes(ret.X[0], in + 48, sizeof(ret.X[0])); + limbs_from_be_bytes(ret.Y[1], in + 96, sizeof(ret.Y[1])); + limbs_from_be_bytes(ret.Y[0], in + 144, sizeof(ret.Y[0])); + + /* clear top 3 bits in case caller was conveying some information there */ + ret.X[1][sizeof(ret.X[1])/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; + add_fp(temp, ret.X[1], ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X[1], sizeof(temp))) + return BLST_BAD_ENCODING; + + add_fp(temp, ret.X[0], ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X[0], sizeof(temp))) + return BLST_BAD_ENCODING; + + add_fp(temp, ret.Y[1], ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.Y[1], sizeof(temp))) + return BLST_BAD_ENCODING; + + add_fp(temp, ret.Y[0], ZERO_384); /* less than modulus? 
*/ + if (!vec_is_equal(temp, ret.Y[0], sizeof(temp))) + return BLST_BAD_ENCODING; + + mul_fp(ret.X[0], ret.X[0], BLS12_381_RR); + mul_fp(ret.X[1], ret.X[1], BLS12_381_RR); + mul_fp(ret.Y[0], ret.Y[0], BLS12_381_RR); + mul_fp(ret.Y[1], ret.Y[1], BLS12_381_RR); + + if (!POINTonE2_affine_on_curve(&ret)) + return BLST_POINT_NOT_ON_CURVE; + + vec_copy(out, &ret, sizeof(ret)); + + return BLST_SUCCESS; +} + +static BLST_ERROR POINTonE2_Deserialize_Z(POINTonE2_affine *out, + const unsigned char in[192]) +{ + unsigned char in0 = in[0]; + + if ((in0 & 0xe0) == 0) + return POINTonE2_Deserialize_BE(out, in); + + if (in0 & 0x80) /* compressed bit */ + return POINTonE2_Uncompress_Z(out, in); + + if (in0 & 0x40) { /* infinity bit */ + if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 191)) { + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + } + + return BLST_BAD_ENCODING; +} + +BLST_ERROR blst_p2_deserialize(POINTonE2_affine *out, + const unsigned char in[192]) +{ return POINTonE2_Deserialize_Z(out, in); } + +#include "ec_ops.h" +POINT_DADD_IMPL(POINTonE2, 384x, fp2) +POINT_DADD_AFFINE_IMPL_A0(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINT_ADD_IMPL(POINTonE2, 384x, fp2) +POINT_ADD_AFFINE_IMPL(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINT_DOUBLE_IMPL_A0(POINTonE2, 384x, fp2) +POINT_IS_EQUAL_IMPL(POINTonE2, 384x, fp2) + +void blst_p2_add(POINTonE2 *out, const POINTonE2 *a, const POINTonE2 *b) +{ POINTonE2_add(out, a, b); } + +void blst_p2_add_or_double(POINTonE2 *out, const POINTonE2 *a, + const POINTonE2 *b) +{ POINTonE2_dadd(out, a, b, NULL); } + +void blst_p2_add_affine(POINTonE2 *out, const POINTonE2 *a, + const POINTonE2_affine *b) +{ POINTonE2_add_affine(out, a, b); } + +void blst_p2_add_or_double_affine(POINTonE2 *out, const POINTonE2 *a, + const POINTonE2_affine *b) +{ POINTonE2_dadd_affine(out, a, b); } + +void blst_p2_double(POINTonE2 *out, const POINTonE2 *a) +{ POINTonE2_double(out, a); } + +int blst_p2_is_equal(const POINTonE2 *a, const POINTonE2 *b) +{ return (int)POINTonE2_is_equal(a, b); } + +#include "ec_mult.h" +POINT_MULT_SCALAR_WX_IMPL(POINTonE2, 4) +POINT_MULT_SCALAR_WX_IMPL(POINTonE2, 5) + +#ifdef __BLST_PRIVATE_TESTMODE__ +POINT_AFFINE_MULT_SCALAR_IMPL(POINTonE2) + +DECLARE_PRIVATE_POINTXZ(POINTonE2, 384x) +POINT_LADDER_PRE_IMPL(POINTonE2, 384x, fp2) +POINT_LADDER_STEP_IMPL_A0(POINTonE2, 384x, fp2, onE2) +POINT_LADDER_POST_IMPL_A0(POINTonE2, 384x, fp2, onE2) +POINT_MULT_SCALAR_LADDER_IMPL(POINTonE2) +#endif + +static void psi(POINTonE2 *out, const POINTonE2 *in) +{ + static const vec384x frobenius_x = { /* 1/(1 + i)^((P-1)/3) */ + { 0 }, + { /* (0x1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4 + 897d29650fb85f9b409427eb4f49fffd8bfd00000000aaad << 384) % P */ + TO_LIMB_T(0x890dc9e4867545c3), TO_LIMB_T(0x2af322533285a5d5), + TO_LIMB_T(0x50880866309b7e2c), TO_LIMB_T(0xa20d1b8c7e881024), + TO_LIMB_T(0x14e4f04fe2db9068), TO_LIMB_T(0x14e56d3f1564853a) } + }; + static const vec384x frobenius_y = { /* 1/(1 + i)^((P-1)/2) */ + { /* (0x135203e60180a68ee2e9c448d77a2cd91c3dedd930b1cf60 + ef396489f61eb45e304466cf3e67fa0af1ee7b04121bdea2 << 384) % P */ + TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), + TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), + TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, + { /* (0x06af0e0437ff400b6831e36d6bd17ffe48395dabc2d3435e + 77f76e17009241c5ee67992f72ec05f4c81084fbede3cc09 << 384) % P */ + TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), 
TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) }, + }; + + vec_copy(out, in, sizeof(*out)); + cneg_fp(out->X[1], out->X[1], 1); mul_fp2(out->X, out->X, frobenius_x); + cneg_fp(out->Y[1], out->Y[1], 1); mul_fp2(out->Y, out->Y, frobenius_y); + cneg_fp(out->Z[1], out->Z[1], 1); +} + +/* Galbraith-Lin-Scott, ~67% faster than POINTonE2_mul_w5 */ +static void POINTonE2_mult_gls(POINTonE2 *out, const POINTonE2 *in, + const pow256 SK) +{ + union { vec256 l; pow256 s; } val; + + /* break down SK to "digits" with |z| as radix [in constant time] */ + + limbs_from_le_bytes(val.l, SK, 32); + div_by_zz(val.l); + div_by_z(val.l); + div_by_z(val.l + NLIMBS(256)/2); + le_bytes_from_limbs(val.s, val.l, 32); + + { + const byte *scalars[2] = { val.s, NULL }; + POINTonE2 table[4][1<<(5-1)]; /* 18KB */ + size_t i; + + POINTonE2_precompute_w5(table[0], in); + for (i = 0; i < 1<<(5-1); i++) { + psi(&table[1][i], &table[0][i]); + psi(&table[2][i], &table[1][i]); + psi(&table[3][i], &table[2][i]); + POINTonE2_cneg(&table[1][i], 1); /* account for z being negative */ + POINTonE2_cneg(&table[3][i], 1); + } + + POINTonE2s_mult_w5(out, NULL, 4, scalars, 64, table); + } + + vec_zero(val.l, sizeof(val)); /* scrub the copy of SK */ +} + +static void POINTonE2_sign(POINTonE2 *out, const POINTonE2 *in, const pow256 SK) +{ + vec384x Z, ZZ; + limb_t inf; + + POINTonE2_mult_gls(out, in, SK); + + /* convert to affine to remove possible bias in out->Z */ + inf = vec_is_zero(out->Z, sizeof(out->Z)); +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + flt_reciprocal_fp2(Z, out->Z); /* 1/Z */ +#else + reciprocal_fp2(Z, out->Z); /* 1/Z */ +#endif + + sqr_fp2(ZZ, Z); + mul_fp2(out->X, out->X, ZZ); /* X = X/Z^2 */ + + mul_fp2(ZZ, ZZ, Z); + mul_fp2(out->Y, out->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, out->Z, BLS12_381_G2.Z, sizeof(BLS12_381_G2.Z), + inf); /* Z = inf ? 
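Background on the GLS decomposition used by POINTonE2_mult_gls (standard for BLS12 curves, up to the usual sign conventions): psi is the twist-Frobenius-untwist endomorphism, which on the order-r subgroup acts as multiplication by p mod r, and p ≡ z (mod r) for the curve parameter z, so the scalar is cut into four roughly 64-bit digits in radix |z| (the div_by_zz/div_by_z calls) and the four quarter-length multiplications share one doubling chain; the psi and psi³ tables are negated because z itself is negative:

\[
\psi(Q) = [\,p \bmod r\,]\,Q = [z]\,Q \quad (Q\in\mathbb{G}_{2}),\qquad
k = k_{0}+k_{1}z+k_{2}z^{2}+k_{3}z^{3},\quad |k_{i}|\approx r^{1/4},
\]
\[
[k]Q = [k_{0}]Q + [k_{1}]\,\psi(Q) + [k_{2}]\,\psi^{2}(Q) + [k_{3}]\,\psi^{3}(Q).
\]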
0 : 1 */ +} + +void blst_sk_to_pk_in_g2(POINTonE2 *out, const pow256 SK) +{ POINTonE2_sign(out, &BLS12_381_G2, SK); } + +void blst_sign_pk_in_g1(POINTonE2 *out, const POINTonE2 *msg, const pow256 SK) +{ POINTonE2_sign(out, msg, SK); } + +void blst_sk_to_pk2_in_g2(unsigned char out[192], POINTonE2_affine *PK, + const pow256 SK) +{ + POINTonE2 P[1]; + + POINTonE2_sign(P, &BLS12_381_G2, SK); + if (PK != NULL) + vec_copy(PK, P, sizeof(*PK)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE2_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_sign_pk2_in_g1(unsigned char out[192], POINTonE2_affine *sig, + const POINTonE2 *hash, const pow256 SK) +{ + POINTonE2 P[1]; + + POINTonE2_sign(P, hash, SK); + if (sig != NULL) + vec_copy(sig, P, sizeof(*sig)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE2_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_p2_mult(POINTonE2 *out, const POINTonE2 *a, + const byte *scalar, size_t nbits) +{ + if (nbits < 144) { + if (nbits) + POINTonE2_mult_w4(out, a, scalar, nbits); + else + vec_zero(out, sizeof(*out)); + } else if (nbits <= 256) { + union { vec256 l; pow256 s; } val; + size_t i, j, top, mask = (size_t)0 - 1; + + /* this is not about constant-time-ness, but branch optimization */ + for (top = (nbits + 7)/8, i=0, j=0; i> (8*sizeof(top)-1)); + j += 1 & mask; + } + + if (check_mod_256(val.s, BLS12_381_r)) /* z^4 is the formal limit */ + POINTonE2_mult_gls(out, a, val.s); + else /* should never be the case, added for formal completeness */ + POINTonE2_mult_w5(out, a, scalar, nbits); + + vec_zero(val.l, sizeof(val)); + } else { /* should never be the case, added for formal completeness */ + POINTonE2_mult_w5(out, a, scalar, nbits); + } +} + +void blst_p2_unchecked_mult(POINTonE2 *out, const POINTonE2 *a, + const byte *scalar, size_t nbits) +{ + if (nbits) + POINTonE2_mult_w4(out, a, scalar, nbits); + else + vec_zero(out, sizeof(*out)); +} + +int blst_p2_affine_is_equal(const POINTonE2_affine *a, + const POINTonE2_affine *b) +{ return (int)vec_is_equal(a, b, sizeof(*a)); } + +int blst_p2_is_inf(const POINTonE2 *p) +{ return (int)vec_is_zero(p->Z, sizeof(p->Z)); } + +const POINTonE2 *blst_p2_generator(void) +{ return &BLS12_381_G2; } + +int blst_p2_affine_is_inf(const POINTonE2_affine *p) +{ return (int)vec_is_zero(p, sizeof(*p)); } + +const POINTonE2_affine *blst_p2_affine_generator(void) +{ return (const POINTonE2_affine *)&BLS12_381_G2; } + +size_t blst_p2_sizeof(void) +{ return sizeof(POINTonE2); } + +size_t blst_p2_affine_sizeof(void) +{ return sizeof(POINTonE2_affine); } diff --git a/crypto/blst_src/ec_mult.h b/crypto/blst_src/ec_mult.h new file mode 100644 index 00000000000..192f7337cbf --- /dev/null +++ b/crypto/blst_src/ec_mult.h @@ -0,0 +1,289 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_EC_MULT_H__ +#define __BLS12_381_ASM_EC_MULT_H__ + +#include "point.h" + +/* Works up to 9 bits */ +static limb_t get_wval(const byte *d, size_t off, size_t bits) +{ + size_t top = off + bits - 1; + limb_t ret; + + ret = ((limb_t)d[top / 8] << 8) | d[off / 8]; + + return ret >> (off%8); +} + +/* Works up to 25 bits. 
*/ +static limb_t get_wval_limb(const byte *d, size_t off, size_t bits) +{ + size_t i, top = (off + bits - 1)/8; + limb_t ret, mask = (limb_t)0 - 1; + + d += off/8; + top -= off/8-1; + + /* this is not about constant-time-ness, but branch optimization */ + for (ret=0, i=0; i<4;) { + ret |= (*d & mask) << (8*i); + mask = (limb_t)0 - ((++i - top) >> (8*sizeof(top)-1)); + d += 1 & mask; + } + + return ret >> (off%8); +} + +/* + * Window value encoding that utilizes the fact that -P is trivially + * calculated, which allows to halve the size of pre-computed table, + * is attributed to A. D. Booth, hence the name of the subroutines... + */ +static limb_t booth_encode(limb_t wval, size_t sz) +{ + limb_t mask = 0 - (wval >> sz); /* "sign" bit -> mask */ + + wval = (wval + 1) >> 1; + wval = (wval & ~mask) | ((0-wval) & mask); + + /* &0x1f, but <=0x10, is index in table, rest is extended "sign" bit */ + return wval; +} + +/* + * Key feature of these constant-time subroutines is that they tolerate + * zeros in most significant bit positions of the scalar[s], or in other + * words, zero-padded scalar values. This means that one can and should + * pass order's bit-length, which is customarily publicly known, instead + * of the factual scalars' bit-lengths. This is facilitated by point + * addition subroutines implemented to handle points at infinity, which + * are encoded as Z==0. [Doubling agorithms handle such points at + * infinity "naturally," since resulting Z is product of original Z.] + */ +#define POINT_MULT_SCALAR_WX_IMPL(ptype, SZ) \ +static void ptype##_gather_booth_w##SZ(ptype *restrict p, \ + const ptype table[1<<(SZ-1)], \ + limb_t booth_idx) \ +{ \ + size_t i; \ + bool_t booth_sign = (booth_idx >> SZ) & 1; \ +\ + booth_idx &= (1< 0) \ + wval = get_wval(scalar, bits - 1, window + 1) & wmask; \ + else \ + wval = (scalar[0] << 1) & wmask; \ +\ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(ret, table[0], wval); \ +\ + i = 1; \ + while (bits > 0) { \ + for (; i < npoints; i++) { \ + scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \ + wval = get_wval(scalar, bits - 1, window + 1) & wmask; \ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(temp, table[i], wval); \ + ptype##_dadd(ret, ret, temp, NULL); \ + } \ +\ + for (j = 0; j < SZ; j++) \ + ptype##_double(ret, ret); \ +\ + window = SZ; \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ + bits -= window; \ + i = 0; scalar_s = scalars; \ + } \ +\ + for (; i < npoints; i++) { \ + scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \ + wval = (scalar[0] << 1) & wmask; \ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(temp, table[i], wval); \ + ptype##_dadd(ret, ret, temp, NULL); \ + } \ +} \ +\ +static void ptype##_mult_w##SZ(ptype *ret, const ptype *point, \ + const byte *scalar, size_t bits) \ +{ \ + limb_t wmask, wval; \ + size_t j, window; \ + ptype temp[1]; \ + ptype table[1<<(SZ-1)]; \ +\ + ptype##_precompute_w##SZ(table, point); \ +\ + /* top excess bits modulo target window size */ \ + window = bits % SZ; /* yes, it may be zero */ \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ +\ + bits -= window; \ + wval = bits ? get_wval(scalar, bits - 1, window + 1) \ + : (limb_t)scalar[0] << 1; \ + wval &= wmask; \ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(ret, table, wval); \ +\ + while (bits > 0) { \ + for (j = 0; j < SZ; j++) \ + ptype##_double(ret, ret); \ +\ + window = SZ; \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ + bits -= window; \ +\ + wval = bits ? 
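A standalone sketch of the signed-window recoding idea behind booth_encode and the 2^(SZ-1)-entry tables: every window digit is mapped into [-2^(w-1), 2^(w-1)], so only the positive multiples 1..2^(w-1) need precomputing and the negative ones come from cheap point negation. Toy version on a 64-bit scalar with hypothetical names; the real code recodes on the fly from a byte string and stays branch-free:

#include <stdint.h>
#include <stdio.h>

#define W 5   /* window size, matching the w5 tables above */

/* Recode k into digits d[0..n-1] with k = sum d[i]*2^(W*i) and every
 * digit in [-(2^(W-1)-1), 2^(W-1)].  digits[] needs room for
 * ceil(64/W)+1 entries. */
static int recode(int8_t digits[], uint64_t k)
{
    int n = 0;
    while (k) {
        int d = (int)(k & ((1u << W) - 1));     /* k mod 2^W             */
        if (d > (1 << (W - 1)))                 /* map into signed range */
            d -= 1 << W;
        digits[n++] = (int8_t)d;
        k = (k - (uint64_t)(int64_t)d) >> W;    /* k-d is divisible by 2^W */
    }
    return n;
}

int main(void)
{
    int8_t d[16];
    uint64_t k = 0x1234567890abcdefULL, acc = 0;
    int n = recode(d, k), i;

    for (i = n; i--;)                           /* Horner: sum d[i]*2^(W*i) */
        acc = (acc << W) + (uint64_t)(int64_t)d[i];
    printf("%d\n", acc == k);                   /* prints 1 */
    return 0;
}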
get_wval(scalar, bits - 1, window + 1) \ + : (limb_t)scalar[0] << 1; \ + wval &= wmask; \ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(temp, table, wval); \ + if (bits > 0) ptype##_add(ret, ret, temp); \ + else ptype##_dadd(ret, ret, temp, NULL); \ + } \ +} + +#if 0 +/* ~50%, or ~2x[!] slower than w5... */ +#define POINT_MULT_SCALAR_LADDER_IMPL(ptype) \ +static void ptype##_mult_ladder(ptype *ret, const ptype *p, \ + const byte *scalar, size_t bits) \ +{ \ + ptype sum[1]; \ + bool_t bit, pbit = 0; \ +\ + vec_copy(sum, p, sizeof(ptype)); \ + vec_zero(ret, sizeof(ptype)); /* infinity */ \ +\ + while (bits--) { \ + bit = is_bit_set(scalar, bits); \ + bit ^= pbit; \ + ptype##_cswap(ret, sum, bit); \ + ptype##_add(sum, sum, ret); \ + ptype##_double(ret, ret); \ + pbit ^= bit; \ + } \ + ptype##_cswap(ret, sum, pbit); \ +} +#else +/* >40% better performance than above, [and ~30% slower than w5]... */ +#define POINT_MULT_SCALAR_LADDER_IMPL(ptype) \ +static void ptype##_mult_ladder(ptype *out, const ptype *p, \ + const byte *scalar, size_t bits) \ +{ \ + ptype##xz sum[1]; \ + ptype##xz pxz[1]; \ + ptype##xz ret[1]; \ + bool_t bit, pbit = 0; \ +\ + ptype##xz_ladder_pre(pxz, p); \ + vec_copy(sum, pxz, sizeof(ptype##xz)); \ + vec_zero(ret, sizeof(ptype##xz)); /* infinity */ \ +\ + while (bits--) { \ + bit = is_bit_set(scalar, bits); \ + bit ^= pbit; \ + ptype##xz_cswap(ret, sum, bit); \ + ptype##xz_ladder_step(ret, sum, pxz); \ + pbit ^= bit; \ + } \ + ptype##xz_cswap(ret, sum, pbit); \ + ptype##xz_ladder_post(out, ret, sum, pxz, p->Y); \ +} +#endif + +/* + * Sole reason for existence of this implementation is that addition + * with affine point renders a share of multiplications redundant by + * virtue of Z==1. And since pre-defined generator point can be and + * customarily is instantiated affine, it would be hardly appropriate + * to pass on this opportunity. Though while it's faster than the + * generic ladder implementation, by ~25%, it's not faster than XZ one + * above, <15% slower. Just in case, it's faster than generic ladder + * even if one accounts for prior conversion to affine coordinates, + * so that choice [for resource-constrained case] is actually between + * this plus said conversion and XZ ladder... + * + * To summarize, if ptype##_mult_w5 executed in one unit of time, then + * - naive ptype##_mult_ladder would execute in ~2; + * - XZ version above - in ~1.4; + * - ptype##_affine_mult_ladder below - in ~1.65; + * - [small-footprint ptype##_to_affine would run in ~0.18]. + * + * Caveat lector, |p_affine|*(order+2) produces wrong result, because + * addition doesn't handle doubling. Indeed, P*(order+1) is P and it + * fails to add with itself producing infinity in last addition. But + * as long as |scalar| is reduced modulo order, as it should be, it's + * not a problem... 
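/*
 * Aside, not part of the vendored sources: a standalone toy showing the
 * MSB-first "double, add, conditionally keep" loop that the ladder macros
 * here instantiate for curve points. Plain integers stand in for points,
 * so doubling is *2 and addition is +; all names are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t toy_mult_ladder(uint64_t point, const uint8_t *scalar,
                                size_t bits)
{
    uint64_t acc = 0;                       /* plays the role of infinity */

    while (bits--) {
        uint64_t sum;
        int bit = (scalar[bits / 8] >> (bits % 8)) & 1;

        acc *= 2;                           /* ptype##_double(ret, ret) */
        sum = acc + point;                  /* ptype##_add_affine(sum, ret, p) */
        if (bit)                            /* ptype##_ccopy(ret, sum, bit) */
            acc = sum;
    }
    return acc;
}

int main(void)
{
    uint8_t k[1] = { 45 };                  /* scalar 45, scanned over 8 bits */

    printf("%llu\n", (unsigned long long)toy_mult_ladder(7, k, 8)); /* 315 */
    return 0;
}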
+ */ +#define POINT_AFFINE_MULT_SCALAR_IMPL(ptype) \ +static void ptype##_affine_mult_ladder(ptype *ret, \ + const ptype##_affine *p_affine, \ + const byte *scalar, size_t bits) \ +{ \ + ptype sum[1]; \ + bool_t bit; \ +\ + vec_zero(ret, sizeof(ptype)); /* infinity */ \ +\ + while (bits--) { \ + ptype##_double(ret, ret); \ + ptype##_add_affine(sum, ret, p_affine); \ + bit = (scalar[bits / LIMB_T_BITS] >> (bits % LIMB_T_BITS)) & 1; \ + ptype##_ccopy(ret, sum, bit); \ + } \ +} +#endif diff --git a/crypto/blst_src/ec_ops.h b/crypto/blst_src/ec_ops.h new file mode 100644 index 00000000000..0d531f816e2 --- /dev/null +++ b/crypto/blst_src/ec_ops.h @@ -0,0 +1,787 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_384_ASM_EC_OPS_H__ +#define __BLS12_384_ASM_EC_OPS_H__ +/* + * Addition that can handle doubling [as well as points at infinity, + * which are encoded as Z==0] in constant time. It naturally comes at + * cost, but this subroutine should be called only when independent + * points are processed, which is considered reasonable compromise. + * For example, ptype##s_mult_w5 calls it, but since *major* gain is + * result of pure doublings being effectively divided by amount of + * points, slightly slower addition can be tolerated. But what is the + * additional cost more specifically? Best addition result is 11M+5S, + * while this routine takes 13M+5S (+1M+1S if a4!=0), as per + * + * -------------+------------- + * addition | doubling + * -------------+------------- + * U1 = X1*Z2^2 | U1 = X1 + * U2 = X2*Z1^2 | + * S1 = Y1*Z2^3 | S1 = Y1 + * S2 = Y2*Z1^3 | + * zz = Z1*Z2 | zz = Z1 + * H = U2-U1 | H' = 2*Y1 + * R = S2-S1 | R' = 3*X1^2[+a*Z1^4] + * sx = U1+U2 | sx = X1+X1 + * -------------+------------- + * H!=0 || R!=0 | H==0 && R==0 + * + * X3 = R^2-H^2*sx + * Y3 = R*(H^2*U1-X3)-H^3*S1 + * Z3 = H*zz + * + * As for R!=0 condition in context of H==0, a.k.a. P-P. The result is + * infinity by virtue of Z3 = (U2-U1)*zz = H*zz = 0*zz == 0. 
+ */ +#define POINT_DADD_IMPL(ptype, bits, field) \ +static void ptype##_dadd(ptype *out, const ptype *p1, const ptype *p2, \ + const vec##bits a4) \ +{ \ + ptype p3; /* starts as (U1, S1, zz) from addition side */\ + struct { vec##bits H, R, sx; } add, dbl; \ + bool_t p1inf, p2inf, is_dbl; \ +\ + add_##field(dbl.sx, p1->X, p1->X); /* sx = X1+X1 */\ + sqr_##field(dbl.R, p1->X); /* X1^2 */\ + mul_by_3_##field(dbl.R, dbl.R); /* R = 3*X1^2 */\ + add_##field(dbl.H, p1->Y, p1->Y); /* H = 2*Y1 */\ +\ + p2inf = vec_is_zero(p2->Z, sizeof(p2->Z)); \ + sqr_##field(p3.X, p2->Z); /* Z2^2 */\ + mul_##field(p3.Z, p1->Z, p2->Z); /* Z1*Z2 */\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + sqr_##field(add.H, p1->Z); /* Z1^2 */\ +\ + if (a4 != NULL) { \ + sqr_##field(p3.Y, add.H); /* Z1^4, [borrow p3.Y] */\ + mul_##field(p3.Y, p3.Y, a4); \ + add_##field(dbl.R, dbl.R, p3.Y);/* R = 3*X1^2+a*Z1^4 */\ + } \ +\ + mul_##field(p3.Y, p1->Y, p2->Z); \ + mul_##field(p3.Y, p3.Y, p3.X); /* S1 = Y1*Z2^3 */\ + mul_##field(add.R, p2->Y, p1->Z); \ + mul_##field(add.R, add.R, add.H); /* S2 = Y2*Z1^3 */\ + sub_##field(add.R, add.R, p3.Y); /* R = S2-S1 */\ +\ + mul_##field(p3.X, p3.X, p1->X); /* U1 = X1*Z2^2 */\ + mul_##field(add.H, add.H, p2->X); /* U2 = X2*Z1^2 */\ +\ + add_##field(add.sx, add.H, p3.X); /* sx = U1+U2 */\ + sub_##field(add.H, add.H, p3.X); /* H = U2-U1 */\ +\ + /* make the choice between addition and doubling */\ + is_dbl = vec_is_zero(add.H, 2*sizeof(add.H)); \ + vec_select(&p3, p1, &p3, sizeof(p3), is_dbl); \ + vec_select(&add, &dbl, &add, sizeof(add), is_dbl); \ + /* |p3| and |add| hold all inputs now, |p3| will hold output */\ +\ + mul_##field(p3.Z, p3.Z, add.H); /* Z3 = H*Z1*Z2 */\ +\ + sqr_##field(dbl.H, add.H); /* H^2 */\ + mul_##field(dbl.R, dbl.H, add.H); /* H^3 */\ + mul_##field(dbl.R, dbl.R, p3.Y); /* H^3*S1 */\ + mul_##field(p3.Y, dbl.H, p3.X); /* H^2*U1 */\ +\ + mul_##field(dbl.H, dbl.H, add.sx); /* H^2*sx */\ + sqr_##field(p3.X, add.R); /* R^2 */\ + sub_##field(p3.X, p3.X, dbl.H); /* X3 = R^2-H^2*sx */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* H^2*U1-X3 */\ + mul_##field(p3.Y, p3.Y, add.R); /* R*(H^2*U1-X3) */\ + sub_##field(p3.Y, p3.Y, dbl.R); /* Y3 = R*(H^2*U1-X3)-H^3*S1 */\ +\ + vec_select(&p3, p1, &p3, sizeof(ptype), p2inf); \ + vec_select(out, p2, &p3, sizeof(ptype), p1inf); \ +} + +/* + * Addition with affine point that can handle doubling [as well as + * points at infinity, with |p1| being encoded as Z==0 and |p2| as + * X,Y==0] in constant time. But at what additional cost? Best + * addition result is 7M+4S, while this routine takes 8M+5S, as per + * + * -------------+------------- + * addition | doubling + * -------------+------------- + * U1 = X1 | U1 = X2 + * U2 = X2*Z1^2 | + * S1 = Y1 | S1 = Y2 + * S2 = Y2*Z1^3 | + * H = U2-X1 | H' = 2*Y2 + * R = S2-Y1 | R' = 3*X2^2[+a] + * sx = X1+U2 | sx = X2+X2 + * zz = H*Z1 | zz = H' + * -------------+------------- + * H!=0 || R!=0 | H==0 && R==0 + * + * X3 = R^2-H^2*sx + * Y3 = R*(H^2*U1-X3)-H^3*S1 + * Z3 = zz + * + * As for R!=0 condition in context of H==0, a.k.a. P-P. The result is + * infinity by virtue of Z3 = (U2-U1)*zz = H*zz = 0*zz == 0. 
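/*
 * Aside, not part of the vendored sources: the dadd macros in this file
 * resolve the addition-vs-doubling choice with vec_select() rather than a
 * branch. This standalone toy shows the underlying masked-select idea on a
 * single word; all names here are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t ct_select(uint64_t a, uint64_t b, uint64_t pick_a)
{
    uint64_t mask = (uint64_t)0 - (pick_a & 1);  /* all-ones iff pick_a == 1 */

    return (a & mask) | (b & ~mask);             /* no data-dependent branch */
}

int main(void)
{
    printf("%llu %llu\n",
           (unsigned long long)ct_select(11, 22, 1),   /* 11 */
           (unsigned long long)ct_select(11, 22, 0));  /* 22 */
    return 0;
}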
+ */ +#define POINT_DADD_AFFINE_IMPL_A0(ptype, bits, field, one) \ +static void ptype##_dadd_affine(ptype *out, const ptype *p1, \ + const ptype##_affine *p2) \ +{ \ + ptype p3; /* starts as (,, H*Z1) from addition side */\ + struct { vec##bits H, R, sx; } add, dbl; \ + bool_t p1inf, p2inf, is_dbl; \ +\ + p2inf = vec_is_zero(p2->X, 2*sizeof(p2->X)); \ + add_##field(dbl.sx, p2->X, p2->X); /* sx = X2+X2 */\ + sqr_##field(dbl.R, p2->X); /* X2^2 */\ + mul_by_3_##field(dbl.R, dbl.R); /* R = 3*X2^2 */\ + add_##field(dbl.H, p2->Y, p2->Y); /* H = 2*Y2 */\ +\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + sqr_##field(add.H, p1->Z); /* Z1^2 */\ + mul_##field(add.R, add.H, p1->Z); /* Z1^3 */\ + mul_##field(add.R, add.R, p2->Y); /* S2 = Y2*Z1^3 */\ + sub_##field(add.R, add.R, p1->Y); /* R = S2-Y1 */\ +\ + mul_##field(add.H, add.H, p2->X); /* U2 = X2*Z1^2 */\ +\ + add_##field(add.sx, add.H, p1->X); /* sx = X1+U2 */\ + sub_##field(add.H, add.H, p1->X); /* H = U2-X1 */\ +\ + mul_##field(p3.Z, add.H, p1->Z); /* Z3 = H*Z1 */\ +\ + /* make the choice between addition and doubling */ \ + is_dbl = vec_is_zero(add.H, 2*sizeof(add.H)); \ + vec_select(p3.X, p2, p1, 2*sizeof(p3.X), is_dbl); \ + vec_select(p3.Z, dbl.H, p3.Z, sizeof(p3.Z), is_dbl);\ + vec_select(&add, &dbl, &add, sizeof(add), is_dbl); \ + /* |p3| and |add| hold all inputs now, |p3| will hold output */\ +\ + sqr_##field(dbl.H, add.H); /* H^2 */\ + mul_##field(dbl.R, dbl.H, add.H); /* H^3 */\ + mul_##field(dbl.R, dbl.R, p3.Y); /* H^3*S1 */\ + mul_##field(p3.Y, dbl.H, p3.X); /* H^2*U1 */\ +\ + mul_##field(dbl.H, dbl.H, add.sx); /* H^2*sx */\ + sqr_##field(p3.X, add.R); /* R^2 */\ + sub_##field(p3.X, p3.X, dbl.H); /* X3 = R^2-H^2*sx */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* H^2*U1-X3 */\ + mul_##field(p3.Y, p3.Y, add.R); /* R*(H^2*U1-X3) */\ + sub_##field(p3.Y, p3.Y, dbl.R); /* Y3 = R*(H^2*U1-X3)-H^3*S1 */\ +\ + vec_select(p3.X, p2, p3.X, 2*sizeof(p3.X), p1inf); \ + vec_select(p3.Z, one, p3.Z, sizeof(p3.Z), p1inf); \ + vec_select(out, p1, &p3, sizeof(ptype), p2inf); \ +} + +/* + * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl + * with twist to handle either input at infinity, which are encoded as Z==0. 
+ */ +#define POINT_ADD_IMPL(ptype, bits, field) \ +static void ptype##_add(ptype *out, const ptype *p1, const ptype *p2) \ +{ \ + ptype p3; \ + vec##bits Z1Z1, Z2Z2, U1, S1, H, I, J; \ + bool_t p1inf, p2inf; \ +\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ +\ + mul_##field(p3.Z, Z1Z1, p1->Z); /* Z1*Z1Z1 */\ + mul_##field(p3.Z, p3.Z, p2->Y); /* S2 = Y2*Z1*Z1Z1 */\ +\ + p2inf = vec_is_zero(p2->Z, sizeof(p2->Z)); \ + sqr_##field(Z2Z2, p2->Z); /* Z2Z2 = Z2^2 */\ +\ + mul_##field(S1, Z2Z2, p2->Z); /* Z2*Z2Z2 */\ + mul_##field(S1, S1, p1->Y); /* S1 = Y1*Z2*Z2Z2 */\ +\ + sub_##field(p3.Z, p3.Z, S1); /* S2-S1 */\ + add_##field(p3.Z, p3.Z, p3.Z); /* r = 2*(S2-S1) */\ +\ + mul_##field(U1, p1->X, Z2Z2); /* U1 = X1*Z2Z2 */\ + mul_##field(H, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ +\ + sub_##field(H, H, U1); /* H = U2-U1 */\ +\ + add_##field(I, H, H); /* 2*H */\ + sqr_##field(I, I); /* I = (2*H)^2 */\ +\ + mul_##field(J, H, I); /* J = H*I */\ + mul_##field(S1, S1, J); /* S1*J */\ +\ + mul_##field(p3.Y, U1, I); /* V = U1*I */\ +\ + sqr_##field(p3.X, p3.Z); /* r^2 */\ + sub_##field(p3.X, p3.X, J); /* r^2-J */\ + sub_##field(p3.X, p3.X, p3.Y); \ + sub_##field(p3.X, p3.X, p3.Y); /* X3 = r^2-J-2*V */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* V-X3 */\ + mul_##field(p3.Y, p3.Y, p3.Z); /* r*(V-X3) */\ + sub_##field(p3.Y, p3.Y, S1); \ + sub_##field(p3.Y, p3.Y, S1); /* Y3 = r*(V-X3)-2*S1*J */\ +\ + add_##field(p3.Z, p1->Z, p2->Z); /* Z1+Z2 */\ + sqr_##field(p3.Z, p3.Z); /* (Z1+Z2)^2 */\ + sub_##field(p3.Z, p3.Z, Z1Z1); /* (Z1+Z2)^2-Z1Z1 */\ + sub_##field(p3.Z, p3.Z, Z2Z2); /* (Z1+Z2)^2-Z1Z1-Z2Z2 */\ + mul_##field(p3.Z, p3.Z, H); /* Z3 = ((Z1+Z2)^2-Z1Z1-Z2Z2)*H */\ +\ + vec_select(&p3, p1, &p3, sizeof(ptype), p2inf); \ + vec_select(out, p2, &p3, sizeof(ptype), p1inf); \ +} + +/* + * https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-madd-2007-bl + * with twist to handle either input at infinity, with |p1| encoded as Z==0, + * and |p2| as X==Y==0. 
+ */ +#define POINT_ADD_AFFINE_IMPL(ptype, bits, field, one) \ +static void ptype##_add_affine(ptype *out, const ptype *p1, \ + const ptype##_affine *p2) \ +{ \ + ptype p3; \ + vec##bits Z1Z1, H, HH, I, J; \ + bool_t p1inf, p2inf; \ +\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ +\ + sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ +\ + mul_##field(p3.Z, Z1Z1, p1->Z); /* Z1*Z1Z1 */\ + mul_##field(p3.Z, p3.Z, p2->Y); /* S2 = Y2*Z1*Z1Z1 */\ +\ + p2inf = vec_is_zero(p2->X, 2*sizeof(p2->X)); \ +\ + mul_##field(H, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ + sub_##field(H, H, p1->X); /* H = U2-X1 */\ +\ + sqr_##field(HH, H); /* HH = H^2 */\ + add_##field(I, HH, HH); \ + add_##field(I, I, I); /* I = 4*HH */\ +\ + mul_##field(p3.Y, p1->X, I); /* V = X1*I */\ + mul_##field(J, H, I); /* J = H*I */\ + mul_##field(I, J, p1->Y); /* Y1*J */\ +\ + sub_##field(p3.Z, p3.Z, p1->Y); /* S2-Y1 */\ + add_##field(p3.Z, p3.Z, p3.Z); /* r = 2*(S2-Y1) */\ +\ + sqr_##field(p3.X, p3.Z); /* r^2 */\ + sub_##field(p3.X, p3.X, J); /* r^2-J */\ + sub_##field(p3.X, p3.X, p3.Y); \ + sub_##field(p3.X, p3.X, p3.Y); /* X3 = r^2-J-2*V */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* V-X3 */\ + mul_##field(p3.Y, p3.Y, p3.Z); /* r*(V-X3) */\ + sub_##field(p3.Y, p3.Y, I); \ + sub_##field(p3.Y, p3.Y, I); /* Y3 = r*(V-X3)-2*Y1*J */\ +\ + add_##field(p3.Z, p1->Z, H); /* Z1+H */\ + sqr_##field(p3.Z, p3.Z); /* (Z1+H)^2 */\ + sub_##field(p3.Z, p3.Z, Z1Z1); /* (Z1+H)^2-Z1Z1 */\ + sub_##field(p3.Z, p3.Z, HH); /* Z3 = (Z1+H)^2-Z1Z1-HH */\ +\ + vec_select(p3.Z, one, p3.Z, sizeof(p3.Z), p1inf); \ + vec_select(p3.X, p2, p3.X, 2*sizeof(p3.X), p1inf); \ + vec_select(out, p1, &p3, sizeof(ptype), p2inf); \ +} + +/* + * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l + */ +#define POINT_DOUBLE_IMPL_A0(ptype, bits, field) \ +static void ptype##_double(ptype *p3, const ptype *p1) \ +{ \ + vec##bits A, B, C; \ +\ + sqr_##field(A, p1->X); /* A = X1^2 */\ + sqr_##field(B, p1->Y); /* B = Y1^2 */\ + sqr_##field(C, B); /* C = B^2 */\ +\ + add_##field(B, B, p1->X); /* X1+B */\ + sqr_##field(B, B); /* (X1+B)^2 */\ + sub_##field(B, B, A); /* (X1+B)^2-A */\ + sub_##field(B, B, C); /* (X1+B)^2-A-C */\ + add_##field(B, B, B); /* D = 2*((X1+B)^2-A-C) */\ +\ + mul_by_3_##field(A, A); /* E = 3*A */\ +\ + sqr_##field(p3->X, A); /* F = E^2 */\ + sub_##field(p3->X, p3->X, B); \ + sub_##field(p3->X, p3->X, B); /* X3 = F-2*D */\ +\ + add_##field(p3->Z, p1->Z, p1->Z); /* 2*Z1 */\ + mul_##field(p3->Z, p3->Z, p1->Y); /* Z3 = 2*Z1*Y1 */\ +\ + mul_by_8_##field(C, C); /* 8*C */\ + sub_##field(p3->Y, B, p3->X); /* D-X3 */\ + mul_##field(p3->Y, p3->Y, A); /* E*(D-X3) */\ + sub_##field(p3->Y, p3->Y, C); /* Y3 = E*(D-X3)-8*C */\ +} + +#define POINT_LADDER_PRE_IMPL(ptype, bits, field) \ +static void ptype##xz_ladder_pre(ptype##xz *pxz, const ptype *p) \ +{ \ + mul_##field(pxz->X, p->X, p->Z); /* X2 = X1*Z1 */\ + sqr_##field(pxz->Z, p->Z); \ + mul_##field(pxz->Z, pxz->Z, p->Z); /* Z2 = Z1^3 */\ +} + +/* + * https://hyperelliptic.org/EFD/g1p/auto-shortw-xz.html#ladder-ladd-2002-it-3 + * with twist to handle either input at infinity, which are encoded as Z==0. + * Just in case, order of doubling and addition is reverse in comparison to + * hyperelliptic.org entry. This was done to minimize temporary storage. + * + * XZ1 is |p|, XZ2&XZ4 are in&out |r|, XZ3&XZ5 are in&out |s|. 
+ */ +#define POINT_LADDER_STEP_IMPL_A0(ptype, bits, field, suffix4b) \ +static void ptype##xz_ladder_step(ptype##xz *r, ptype##xz *s, \ + const ptype##xz *p) \ +{ \ + ptype##xz p5; \ + vec##bits A, B, C, D, XX, ZZ; \ + bool_t r_inf, s_inf; \ + /* s += r */\ + mul_##field(A, r->X, s->X); /* A = X2*X3 */\ + mul_##field(B, r->Z, s->Z); /* B = Z2*Z3 */\ + mul_##field(C, r->X, s->Z); /* C = X2*Z3 */\ + mul_##field(D, r->Z, s->X); /* D = X3*Z2 */\ +\ + sqr_##field(A, A); /* (A[-a*B])^2 */\ + add_##field(p5.X, C, D); /* C+D */\ + mul_##field(p5.X, p5.X, B); /* B*(C+D) */\ + mul_by_4b_##suffix4b(B, p5.X); /* b4*B*(C+D) */\ + sub_##field(p5.X, A, B); /* (A[-a*B])^2-b4*B*(C+D) */\ + mul_##field(p5.X, p5.X, p->Z); /* X5 = Z1*((A[-a*B])^2-b4*B*(C+D)) */\ +\ + sub_##field(p5.Z, C, D); /* C-D */\ + sqr_##field(p5.Z, p5.Z); /* (C-D)^2 */\ + mul_##field(p5.Z, p5.Z, p->X); /* Z5 = X1*(C-D)^2 */\ +\ + r_inf = vec_is_zero(r->Z, sizeof(r->Z)); \ + s_inf = vec_is_zero(s->Z, sizeof(s->Z)); \ +\ + vec_select(&p5, r, &p5, sizeof(ptype##xz), s_inf); \ + vec_select(s, s, &p5, sizeof(ptype##xz), r_inf); \ + /* r *= 2 */\ + sqr_##field(XX, r->X); /* XX = X2^2 */\ + sqr_##field(ZZ, r->Z); /* ZZ = Z2^2 */\ +\ + add_##field(r->Z, r->X, r->Z); /* X2+Z2 */\ + sqr_##field(r->Z, r->Z); /* (X2+Z2)^2 */\ + sub_##field(r->Z, r->Z, XX); /* (X2+Z2)^2-XX */\ + sub_##field(r->Z, r->Z, ZZ); /* E = (X2+Z2)^2-XX-ZZ */\ +\ + sqr_##field(A, XX); /* (XX[-a*ZZ])^2 */\ + mul_##field(B, r->Z, ZZ); /* E*ZZ */\ + mul_by_4b_##suffix4b(C, B); /* b4*E*ZZ */\ + sub_##field(r->X, A, C); /* X4 = (XX[-a*ZZ])^2-b4*E*ZZ */\ +\ + sqr_##field(ZZ, ZZ); /* ZZ^2 */\ + mul_by_4b_##suffix4b(B, ZZ); /* b4*ZZ^2 */\ + mul_##field(r->Z, r->Z, XX); /* E*(XX[+a*ZZ]) */\ + add_##field(r->Z, r->Z, r->Z); /* 2*E*(XX[+a*ZZ]) */\ + add_##field(r->Z, r->Z, B); /* Z4 = 2*E*(XX[+a*ZZ])+b4*ZZ^2 */\ +} + +/* + * Recover the |r|'s y-coordinate using Eq. (8) from Brier-Joye, + * "Weierstraß Elliptic Curves and Side-Channel Attacks", with XZ twist + * and conversion to Jacobian coordinates from /.../ecp_smpl.c, + * and with twist to recover from |s| at infinity [which occurs when + * multiplying by (order-1)]. + * + * X4 = 2*Y1*X2*Z3*Z1*Z2 + * Y4 = 2*b*Z3*(Z1*Z2)^2 + Z3*(a*Z1*Z2+X1*X2)*(X1*Z2+X2*Z1) - X3*(X1*Z2-X2*Z1)^2 + * Z4 = 2*Y1*Z3*Z2^2*Z1 + * + * Z3x2 = 2*Z3 + * Y1Z3x2 = Y1*Z3x2 + * Z1Z2 = Z1*Z2 + * X1Z2 = X1*Z2 + * X2Z1 = X2*Z1 + * X4 = Y1Z3x2*X2*Z1Z2 + * A = b*Z3x2*(Z1Z2)^2 + * B = Z3*(a*Z1Z2+X1*X2)*(X1Z2+X2Z1) + * C = X3*(X1Z2-X2Z1)^2 + * Y4 = A+B-C + * Z4 = Y1Z3x2*Z1Z2*Z2 + * + * XZ1 is |p|, XZ2 is |r|, XZ3 is |s|, 'a' is 0. 
+ */ +#define POINT_LADDER_POST_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##xz_ladder_post(ptype *p4, \ + const ptype##xz *r, const ptype##xz *s, \ + const ptype##xz *p, const vec##bits Y1) \ +{ \ + vec##bits Z3x2, Y1Z3x2, Z1Z2, X1Z2, X2Z1, A, B, C; \ + bool_t s_inf; \ +\ + add_##field(Z3x2, s->Z, s->Z); /* Z3x2 = 2*Z3 */\ + mul_##field(Y1Z3x2, Y1, Z3x2); /* Y1Z3x2 = Y1*Z3x2 */\ + mul_##field(Z1Z2, p->Z, r->Z); /* Z1Z2 = Z1*Z2 */\ + mul_##field(X1Z2, p->X, r->Z); /* X1Z2 = X1*Z2 */\ + mul_##field(X2Z1, r->X, p->Z); /* X2Z1 = X2*Z1 */\ +\ + mul_##field(p4->X, Y1Z3x2, r->X); /* Y1Z3x2*X2 */\ + mul_##field(p4->X, p4->X, Z1Z2); /* X4 = Y1Z3x2*X2*Z1Z2 */\ +\ + sqr_##field(A, Z1Z2); /* (Z1Z2)^2 */\ + mul_##field(B, A, Z3x2); /* Z3x2*(Z1Z2)^2 */\ + mul_by_b_##suffixb(A, B); /* A = b*Z3x2*(Z1Z2)^2 */\ +\ + mul_##field(B, p->X, r->X); /* [a*Z1Z2+]X1*X2 */\ + mul_##field(B, B, s->Z); /* Z3*([a*Z1Z2+]X1*X2) */\ + add_##field(C, X1Z2, X2Z1); /* X1Z2+X2Z1 */\ + mul_##field(B, B, C); /* B = Z3*([a*Z2Z1+]X1*X2)*(X1Z2+X2Z1) */\ +\ + sub_##field(C, X1Z2, X2Z1); /* X1Z2-X2Z1 */\ + sqr_##field(C, C); /* (X1Z2-X2Z1)^2 */\ + mul_##field(C, C, s->X); /* C = X3*(X1Z2-X2Z1)^2 */\ +\ + add_##field(A, A, B); /* A+B */\ + sub_##field(A, A, C); /* Y4 = A+B-C */\ +\ + mul_##field(p4->Z, Z1Z2, r->Z); /* Z1Z2*Z2 */\ + mul_##field(p4->Z, p4->Z, Y1Z3x2); /* Y1Z3x2*Z1Z2*Z2 */\ +\ + s_inf = vec_is_zero(s->Z, sizeof(s->Z)); \ + vec_select(p4->X, p->X, p4->X, sizeof(p4->X), s_inf); \ + vec_select(p4->Y, Y1, A, sizeof(p4->Y), s_inf); \ + vec_select(p4->Z, p->Z, p4->Z, sizeof(p4->Z), s_inf); \ + ptype##_cneg(p4, s_inf); \ + /* to Jacobian */\ + mul_##field(p4->X, p4->X, p4->Z); /* X4 = X4*Z4 */\ + sqr_##field(B, p4->Z); \ + mul_##field(p4->Y, p4->Y, B); /* Y4 = Y4*Z4^2 */\ +} + +#define POINT_IS_EQUAL_IMPL(ptype, bits, field) \ +static limb_t ptype##_is_equal(const ptype *p1, const ptype *p2) \ +{ \ + vec##bits Z1Z1, Z2Z2; \ + ptype##_affine a1, a2; \ + bool_t is_inf1 = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + bool_t is_inf2 = vec_is_zero(p2->Z, sizeof(p2->Z)); \ +\ + sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ + sqr_##field(Z2Z2, p2->Z); /* Z2Z2 = Z2^2 */\ +\ + mul_##field(a1.X, p1->X, Z2Z2); /* U1 = X1*Z2Z2 */\ + mul_##field(a2.X, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ +\ + mul_##field(a1.Y, p1->Y, p2->Z); /* Y1*Z2 */\ + mul_##field(a2.Y, p2->Y, p1->Z); /* Y2*Z1 */\ +\ + mul_##field(a1.Y, a1.Y, Z2Z2); /* S1 = Y1*Z2*Z2Z2 */\ + mul_##field(a2.Y, a2.Y, Z1Z1); /* S2 = Y2*Z1*Z1Z1 */\ +\ + return vec_is_equal(&a1, &a2, sizeof(a1)) & (is_inf1 ^ is_inf2 ^ 1); \ +} + +/* + * https://eprint.iacr.org/2015/1060, algorithm 7 with a twist to handle + * |p3| pointing at either |p1| or |p2|. This is resolved by adding |t5| + * and replacing few first references to |X3| in the formula, up to step + * 21, with it. 12M[+27A], doubling and infinity are handled by the + * formula itself. Infinity is to be encoded as [0, !0, 0]. + */ +#define POINT_PROJ_DADD_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##proj_dadd(ptype##proj *p3, const ptype##proj *p1, \ + const ptype##proj *p2) \ +{ \ + vec##bits t0, t1, t2, t3, t4, t5; \ +\ + mul_##field(t0, p1->X, p2->X); /* 1. t0 = X1*X2 */\ + mul_##field(t1, p1->Y, p2->Y); /* 2. t1 = Y1*Y2 */\ + mul_##field(t2, p1->Z, p2->Z); /* 3. t2 = Z1*Z2 */\ + add_##field(t3, p1->X, p1->Y); /* 4. t3 = X1+Y1 */\ + add_##field(t4, p2->X, p2->Y); /* 5. t4 = X2+Y2 */\ + mul_##field(t3, t3, t4); /* 6. t3 = t3*t4 */\ + add_##field(t4, t0, t1); /* 7. t4 = t0+t1 */\ + sub_##field(t3, t3, t4); /* 8. 
t3 = t3-t4 */\ + add_##field(t4, p1->Y, p1->Z); /* 9. t4 = Y1+Z1 */\ + add_##field(t5, p2->Y, p2->Z); /* 10. t5 = Y2+Z2 */\ + mul_##field(t4, t4, t5); /* 11. t4 = t4*t5 */\ + add_##field(t5, t1, t2); /* 12. t5 = t1+t2 */\ + sub_##field(t4, t4, t5); /* 13. t4 = t4-t5 */\ + add_##field(t5, p1->X, p1->Z); /* 14. t5 = X1+Z1 */\ + add_##field(p3->Y, p2->X, p2->Z); /* 15. Y3 = X2+Z2 */\ + mul_##field(t5, t5, p3->Y); /* 16. t5 = t5*Y3 */\ + add_##field(p3->Y, t0, t2); /* 17. Y3 = t0+t2 */\ + sub_##field(p3->Y, t5, p3->Y); /* 18. Y3 = t5-Y3 */\ + mul_by_3_##field(t0, t0); /* 19-20. t0 = 3*t0 */\ + mul_by_3_##field(t5, t2); /* 21. t5 = 3*t2 */\ + mul_by_b_##suffixb(t2, t5); /* 21. t2 = b*t5 */\ + add_##field(p3->Z, t1, t2); /* 22. Z3 = t1+t2 */\ + sub_##field(t1, t1, t2); /* 23. t1 = t1-t2 */\ + mul_by_3_##field(t5, p3->Y); /* 24. t5 = 3*Y3 */\ + mul_by_b_##suffixb(p3->Y, t5); /* 24. Y3 = b*t5 */\ + mul_##field(p3->X, t4, p3->Y); /* 25. X3 = t4*Y3 */\ + mul_##field(t2, t3, t1); /* 26. t2 = t3*t1 */\ + sub_##field(p3->X, t2, p3->X); /* 27. X3 = t2-X3 */\ + mul_##field(p3->Y, p3->Y, t0); /* 28. Y3 = Y3*t0 */\ + mul_##field(t1, t1, p3->Z); /* 29. t1 = t1*Z3 */\ + add_##field(p3->Y, t1, p3->Y); /* 30. Y3 = t1+Y3 */\ + mul_##field(t0, t0, t3); /* 31. t0 = t0*t3 */\ + mul_##field(p3->Z, p3->Z, t4); /* 32. Z3 = Z3*t4 */\ + add_##field(p3->Z, p3->Z, t0); /* 33. Z3 = Z3+t0 */\ +} + +/* + * https://eprint.iacr.org/2015/1060, algorithm 8 with a twist to handle + * |p2| being infinity encoded as [0, 0]. 11M[+21A]. + */ +#define POINT_PROJ_DADD_AFFINE_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##proj_dadd_affine(ptype##proj *out, const ptype##proj *p1, \ + const ptype##_affine *p2) \ +{ \ + ptype##proj p3[1]; \ + vec##bits t0, t1, t2, t3, t4; \ + limb_t p2inf = vec_is_zero(p2, sizeof(*p2)); \ +\ + mul_##field(t0, p1->X, p2->X); /* 1. t0 = X1*X2 */\ + mul_##field(t1, p1->Y, p2->Y); /* 2. t1 = Y1*Y2 */\ + add_##field(t3, p1->X, p1->Y); /* 3. t3 = X1+Y1 */\ + add_##field(t4, p2->X, p2->Y); /* 4. t4 = X2+Y2 */\ + mul_##field(t3, t3, t4); /* 5. t3 = t3*t4 */\ + add_##field(t4, t0, t1); /* 6. t4 = t0+t1 */\ + sub_##field(t3, t3, t4); /* 7. t3 = t3-t4 */\ + mul_##field(t4, p2->Y, p1->Z); /* 8. t4 = Y2*Z1 */\ + add_##field(t4, t4, p1->Y); /* 9. t4 = t4+Y1 */\ + mul_##field(p3->Y, p2->X, p1->Z); /* 10. Y3 = X2*Z1 */\ + add_##field(p3->Y, p3->Y, p1->X); /* 11. Y3 = Y3+X1 */\ + mul_by_3_##field(t0, t0); /* 12-13. t0 = 3*t0 */\ + mul_by_b_##suffixb(t2, p1->Z); /* 14. t2 = b*Z1 */\ + mul_by_3_##field(t2, t2); /* 14. t2 = 3*t2 */\ + add_##field(p3->Z, t1, t2); /* 15. Z3 = t1+t2 */\ + sub_##field(t1, t1, t2); /* 16. t1 = t1-t2 */\ + mul_by_b_##suffixb(t2, p3->Y); /* 17. t2 = b*Y3 */\ + mul_by_3_##field(p3->Y, t2); /* 17. Y3 = 3*t2 */\ + mul_##field(p3->X, t4, p3->Y); /* 18. X3 = t4*Y3 */\ + mul_##field(t2, t3, t1); /* 19. t2 = t3*t1 */\ + sub_##field(p3->X, t2, p3->X); /* 20. X3 = t2-X3 */\ + mul_##field(p3->Y, p3->Y, t0); /* 21. Y3 = Y3*t0 */\ + mul_##field(t1, t1, p3->Z); /* 22. t1 = t1*Z3 */\ + add_##field(p3->Y, t1, p3->Y); /* 23. Y3 = t1+Y3 */\ + mul_##field(t0, t0, t3); /* 24. t0 = t0*t3 */\ + mul_##field(p3->Z, p3->Z, t4); /* 25. Z3 = Z3*t4 */\ + add_##field(p3->Z, p3->Z, t0); /* 26. Z3 = Z3+t0 */\ +\ + vec_select(out, p1, p3, sizeof(*out), p2inf); \ +} + +/* + * https://eprint.iacr.org/2015/1060, algorithm 9 with a twist to handle + * |p3| pointing at |p1|. This is resolved by adding |t3| to hold X*Y + * and reordering operations to bring references to |p1| forward. + * 6M+2S[+13A]. 
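/*
 * Aside, not part of the vendored sources: a standalone toy of the output
 * aliasing concern described above. When the result may share storage with
 * an input, anything still needed later is captured in a temporary (as the
 * |t3|/|t5| temporaries are in these macros) before the output is written.
 */
#include <stdio.h>

typedef struct { int x, y; } pair;

static void rotate_sum_aliased(pair *out, const pair *in)
{
    out->x = in->x + in->y;
    out->y = in->x;             /* wrong if out == in: x is already clobbered */
}

static void rotate_sum_safe(pair *out, const pair *in)
{
    int t = in->x;              /* capture before overwriting the output */

    out->x = in->x + in->y;
    out->y = t;
}

int main(void)
{
    pair a = { 2, 3 }, b = { 2, 3 };

    rotate_sum_aliased(&a, &a);
    rotate_sum_safe(&b, &b);
    printf("aliased: (%d,%d) safe: (%d,%d)\n", a.x, a.y, b.x, b.y);
    return 0;                   /* prints "aliased: (5,5) safe: (5,2)" */
}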
+ */ +#define POINT_PROJ_DOUBLE_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##proj_double(ptype##proj *p3, const ptype##proj *p1) \ +{ \ + vec##bits t0, t1, t2, t3; \ +\ + sqr_##field(t0, p1->Y); /* 1. t0 = Y*Y */\ + mul_##field(t1, p1->Y, p1->Z); /* 5. t1 = Y*Z */\ + sqr_##field(t2, p1->Z); /* 6. t2 = Z*Z */\ + mul_##field(t3, p1->X, p1->Y); /* 16. t3 = X*Y */\ + lshift_##field(p3->Z, t0, 3); /* 2-4. Z3 = 8*t0 */\ + mul_by_b_##suffixb(p3->X, t2); /* 7. t2 = b*t2 */\ + mul_by_3_##field(t2, p3->X); /* 7. t2 = 3*t2 */\ + mul_##field(p3->X, t2, p3->Z); /* 8. X3 = t2*Z3 */\ + add_##field(p3->Y, t0, t2); /* 9. Y3 = t0+t2 */\ + mul_##field(p3->Z, t1, p3->Z); /* 10. Z3 = t1*Z3 */\ + mul_by_3_##field(t2, t2); /* 11-12. t2 = 3*t2 */\ + sub_##field(t0, t0, t2); /* 13. t0 = t0-t2 */\ + mul_##field(p3->Y, t0, p3->Y); /* 14. Y3 = t0*Y3 */\ + add_##field(p3->Y, p3->X, p3->Y); /* 15. Y3 = X3+Y3 */\ + mul_##field(p3->X, t0, t3); /* 17. X3 = t0*t3 */\ + add_##field(p3->X, p3->X, p3->X); /* 18. X3 = X3+X3 */\ +} + +#define POINT_PROJ_TO_JACOBIAN_IMPL(ptype, bits, field) \ +static void ptype##proj_to_Jacobian(ptype *out, const ptype##proj *in) \ +{ \ + vec##bits ZZ; \ +\ + sqr_##field(ZZ, in->Z); \ + mul_##field(out->X, in->X, in->Z); \ + mul_##field(out->Y, in->Y, ZZ); \ + vec_copy(out->Z, in->Z, sizeof(out->Z)); \ +} + +#define POINT_TO_PROJECTIVE_IMPL(ptype, bits, field, one) \ +static void ptype##_to_projective(ptype##proj *out, const ptype *in) \ +{ \ + vec##bits ZZ; \ + limb_t is_inf = vec_is_zero(in->Z, sizeof(in->Z)); \ +\ + sqr_##field(ZZ, in->Z); \ + mul_##field(out->X, in->X, in->Z); \ + vec_select(out->Y, one, in->Y, sizeof(out->Y), is_inf); \ + mul_##field(out->Z, ZZ, in->Z); \ +} + +/******************* !!!!! NOT CONSTANT TIME !!!!! *******************/ + +/* + * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#addition-add-2008-s + * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 + * with twist to handle either input at infinity. Addition costs 12M+2S, + * while conditional doubling - 4M+6M+3S. 
+ */ +#define POINTXYZZ_DADD_IMPL(ptype, bits, field) \ +static void ptype##xyzz_dadd(ptype##xyzz *p3, const ptype##xyzz *p1, \ + const ptype##xyzz *p2) \ +{ \ + vec##bits U, S, P, R; \ +\ + if (vec_is_zero(p2->ZZZ, 2*sizeof(p2->ZZZ))) { \ + vec_copy(p3, p1, sizeof(*p3)); \ + return; \ + } else if (vec_is_zero(p1->ZZZ, 2*sizeof(p1->ZZZ))) { \ + vec_copy(p3, p2, sizeof(*p3)); \ + return; \ + } \ +\ + mul_##field(U, p1->X, p2->ZZ); /* U1 = X1*ZZ2 */\ + mul_##field(S, p1->Y, p2->ZZZ); /* S1 = Y1*ZZZ2 */\ + mul_##field(P, p2->X, p1->ZZ); /* U2 = X2*ZZ1 */\ + mul_##field(R, p2->Y, p1->ZZZ); /* S2 = Y2*ZZZ1 */\ + sub_##field(P, P, U); /* P = U2-U1 */\ + sub_##field(R, R, S); /* R = S2-S1 */\ +\ + if (!vec_is_zero(P, sizeof(P))) { /* X1!=X2 */\ + vec##bits PP, PPP, Q; /* add |p1| and |p2| */\ +\ + sqr_##field(PP, P); /* PP = P^2 */\ + mul_##field(PPP, PP, P); /* PPP = P*PP */\ + mul_##field(Q, U, PP); /* Q = U1*PP */\ + sqr_##field(p3->X, R); /* R^2 */\ + add_##field(P, Q, Q); \ + sub_##field(p3->X, p3->X, PPP); /* R^2-PPP */\ + sub_##field(p3->X, p3->X, P); /* X3 = R^2-PPP-2*Q */\ + sub_##field(Q, Q, p3->X); \ + mul_##field(Q, Q, R); /* R*(Q-X3) */\ + mul_##field(p3->Y, S, PPP); /* S1*PPP */\ + sub_##field(p3->Y, Q, p3->Y); /* Y3 = R*(Q-X3)-S1*PPP */\ + mul_##field(p3->ZZ, p1->ZZ, p2->ZZ); /* ZZ1*ZZ2 */\ + mul_##field(p3->ZZZ, p1->ZZZ, p2->ZZZ); /* ZZZ1*ZZZ2 */\ + mul_##field(p3->ZZ, p3->ZZ, PP); /* ZZ3 = ZZ1*ZZ2*PP */\ + mul_##field(p3->ZZZ, p3->ZZZ, PPP); /* ZZZ3 = ZZZ1*ZZZ2*PPP */\ + } else if (vec_is_zero(R, sizeof(R))) { /* X1==X2 && Y1==Y2 */\ + vec##bits V, W, M; /* double |p1| */\ +\ + add_##field(U, p1->Y, p1->Y); /* U = 2*Y1 */\ + sqr_##field(V, U); /* V = U^2 */\ + mul_##field(W, V, U); /* W = U*V */\ + mul_##field(S, p1->X, V); /* S = X1*V */\ + sqr_##field(M, p1->X); \ + mul_by_3_##field(M, M); /* M = 3*X1^2[+a*ZZ1^2] */\ + sqr_##field(p3->X, M); \ + add_##field(U, S, S); /* 2*S */\ + sub_##field(p3->X, p3->X, U); /* X3 = M^2-2*S */\ + mul_##field(p3->Y, W, p1->Y); /* W*Y1 */\ + sub_##field(S, S, p3->X); \ + mul_##field(S, S, M); /* M*(S-X3) */\ + sub_##field(p3->Y, S, p3->Y); /* Y3 = M*(S-X3)-W*Y1 */\ + mul_##field(p3->ZZ, p1->ZZ, V); /* ZZ3 = V*ZZ1 */\ + mul_##field(p3->ZZZ, p1->ZZZ, W); /* ZZ3 = W*ZZZ1 */\ + } else { /* X1==X2 && Y1==-Y2 */\ + vec_zero(p3->ZZZ, 2*sizeof(p3->ZZZ)); /* set |p3| to infinity */\ + } \ +} + +/* + * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#addition-madd-2008-s + * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-mdbl-2008-s-1 + * with twists to handle even subtractions and either input at infinity. + * Addition costs 8M+2S, while conditional doubling - 2M+4M+3S. 
+ */ +#define POINTXYZZ_DADD_AFFINE_IMPL(ptype, bits, field, one) \ +static void ptype##xyzz_dadd_affine(ptype##xyzz *p3, const ptype##xyzz *p1, \ + const ptype##_affine *p2, \ + bool_t subtract) \ +{ \ + vec##bits P, R; \ +\ + if (vec_is_zero(p2, sizeof(*p2))) { \ + vec_copy(p3, p1, sizeof(*p3)); \ + return; \ + } else if (vec_is_zero(p1->ZZZ, 2*sizeof(p1->ZZZ))) { \ + vec_copy(p3->X, p2->X, 2*sizeof(p3->X));\ + cneg_##field(p3->ZZZ, one, subtract); \ + vec_copy(p3->ZZ, one, sizeof(p3->ZZ)); \ + return; \ + } \ +\ + mul_##field(P, p2->X, p1->ZZ); /* U2 = X2*ZZ1 */\ + mul_##field(R, p2->Y, p1->ZZZ); /* S2 = Y2*ZZZ1 */\ + cneg_##field(R, R, subtract); \ + sub_##field(P, P, p1->X); /* P = U2-X1 */\ + sub_##field(R, R, p1->Y); /* R = S2-Y1 */\ +\ + if (!vec_is_zero(P, sizeof(P))) { /* X1!=X2 */\ + vec##bits PP, PPP, Q; /* add |p2| to |p1| */\ +\ + sqr_##field(PP, P); /* PP = P^2 */\ + mul_##field(PPP, PP, P); /* PPP = P*PP */\ + mul_##field(Q, p1->X, PP); /* Q = X1*PP */\ + sqr_##field(p3->X, R); /* R^2 */\ + add_##field(P, Q, Q); \ + sub_##field(p3->X, p3->X, PPP); /* R^2-PPP */\ + sub_##field(p3->X, p3->X, P); /* X3 = R^2-PPP-2*Q */\ + sub_##field(Q, Q, p3->X); \ + mul_##field(Q, Q, R); /* R*(Q-X3) */\ + mul_##field(p3->Y, p1->Y, PPP); /* Y1*PPP */\ + sub_##field(p3->Y, Q, p3->Y); /* Y3 = R*(Q-X3)-Y1*PPP */\ + mul_##field(p3->ZZ, p1->ZZ, PP); /* ZZ3 = ZZ1*PP */\ + mul_##field(p3->ZZZ, p1->ZZZ, PPP); /* ZZZ3 = ZZZ1*PPP */\ + } else if (vec_is_zero(R, sizeof(R))) { /* X1==X2 && Y1==Y2 */\ + vec##bits U, S, M; /* double |p2| */\ +\ + add_##field(U, p2->Y, p2->Y); /* U = 2*Y1 */\ + sqr_##field(p3->ZZ, U); /* [ZZ3 =] V = U^2 */\ + mul_##field(p3->ZZZ, p3->ZZ, U); /* [ZZZ3 =] W = U*V */\ + mul_##field(S, p2->X, p3->ZZ); /* S = X1*V */\ + sqr_##field(M, p2->X); \ + mul_by_3_##field(M, M); /* M = 3*X1^2[+a] */\ + sqr_##field(p3->X, M); \ + add_##field(U, S, S); /* 2*S */\ + sub_##field(p3->X, p3->X, U); /* X3 = M^2-2*S */\ + mul_##field(p3->Y, p3->ZZZ, p2->Y); /* W*Y1 */\ + sub_##field(S, S, p3->X); \ + mul_##field(S, S, M); /* M*(S-X3) */\ + sub_##field(p3->Y, S, p3->Y); /* Y3 = M*(S-X3)-W*Y1 */\ + cneg_##field(p3->ZZZ, p3->ZZZ, subtract); \ + } else { /* X1==X2 && Y1==-Y2 */\ + vec_zero(p3->ZZZ, 2*sizeof(p3->ZZZ)); /* set |p3| to infinity */\ + } \ +} + +#define POINTXYZZ_TO_JACOBIAN_IMPL(ptype, bits, field) \ +static void ptype##xyzz_to_Jacobian(ptype *out, const ptype##xyzz *in) \ +{ \ + mul_##field(out->X, in->X, in->ZZ); \ + mul_##field(out->Y, in->Y, in->ZZZ); \ + vec_copy(out->Z, in->ZZ, sizeof(out->Z)); \ +} + +#define POINT_TO_XYZZ_IMPL(ptype, bits, field) \ +static void ptype##_to_xyzz(ptype##xyzz *out, const ptype *in) \ +{ \ + vec_copy(out->X, in->X, 2*sizeof(out->X)); \ + sqr_##field(out->ZZ, in->Z); \ + mul_##field(out->ZZZ, out->ZZ, in->Z); \ +} + +#endif diff --git a/crypto/blst_src/errors.h b/crypto/blst_src/errors.h new file mode 100644 index 00000000000..425daeb486f --- /dev/null +++ b/crypto/blst_src/errors.h @@ -0,0 +1,19 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_ERRORS_H__ +#define __BLS12_381_ASM_ERRORS_H__ + +typedef enum { + BLST_SUCCESS = 0, + BLST_BAD_ENCODING, + BLST_POINT_NOT_ON_CURVE, + BLST_POINT_NOT_IN_GROUP, + BLST_AGGR_TYPE_MISMATCH, + BLST_VERIFY_FAIL, + BLST_PK_IS_INFINITY, +} BLST_ERROR; + +#endif diff --git a/crypto/blst_src/exp.c b/crypto/blst_src/exp.c new file mode 100644 index 00000000000..55c5c5a7875 --- /dev/null +++ b/crypto/blst_src/exp.c @@ -0,0 +1,55 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "vect.h" + +/* + * |out| = |inp|^|pow|, small footprint, public exponent + */ +static void exp_mont_384(vec384 out, const vec384 inp, const byte *pow, + size_t pow_bits, const vec384 p, limb_t n0) +{ +#if 1 + vec384 ret; + + vec_copy(ret, inp, sizeof(ret)); /* ret = inp^1 */ + --pow_bits; /* most significant bit is set, skip over */ + while (pow_bits--) { + sqr_mont_384(ret, ret, p, n0); + if (is_bit_set(pow, pow_bits)) + mul_mont_384(ret, ret, inp, p, n0); + } + vec_copy(out, ret, sizeof(ret)); /* out = ret */ +#else + unsigned int i; + vec384 sqr; + + vec_copy(sqr, inp, sizeof(sqr)); + for (i = 0; !is_bit_set(pow, i++);) + sqr_mont_384(sqr, sqr, sqr, p, n0); + vec_copy(out, sqr, sizeof(sqr)); + for (; i < pow_bits; i++) { + sqr_mont_384(sqr, sqr, sqr, p, n0); + if (is_bit_set(pow, i)) + mul_mont_384(out, out, sqr, p, n0); + } +#endif +} + +static void exp_mont_384x(vec384x out, const vec384x inp, const byte *pow, + size_t pow_bits, const vec384 p, limb_t n0) +{ + vec384x ret; + + vec_copy(ret, inp, sizeof(ret)); /* |ret| = |inp|^1 */ + --pow_bits; /* most significant bit is accounted for, skip over */ + while (pow_bits--) { + sqr_mont_384x(ret, ret, p, n0); + if (is_bit_set(pow, pow_bits)) + mul_mont_384x(ret, ret, inp, p, n0); + } + vec_copy(out, ret, sizeof(ret)); /* |out| = |ret| */ +} diff --git a/crypto/blst_src/exports.c b/crypto/blst_src/exports.c new file mode 100644 index 00000000000..ad720999883 --- /dev/null +++ b/crypto/blst_src/exports.c @@ -0,0 +1,559 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * Why this file? Overall goal is to ensure that all internal calls + * remain internal after linking application. This is to both + * + * a) minimize possibility of external name conflicts (since all + * non-blst-prefixed and [assembly subroutines] remain static); + * b) preclude possibility of unintentional internal reference + * overload in shared library context (one can achieve same + * effect with -Bsymbolic, but we don't want to rely on end-user + * to remember to use it); + */ + +#include "fields.h" +#include "bytes.h" + +/* + * BLS12-381-specifc Fr shortcuts to assembly. 
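/*
 * Aside, not part of the vendored sources: the exported wrappers below
 * repeatedly use a one-member-initialized union as a run-time endianness
 * probe. This standalone snippet shows the idiom in isolation: the first
 * byte of a long initialized to 1 reads back as 1 only on little-endian
 * targets.
 */
#include <stdio.h>

int main(void)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };

    printf("little-endian: %d\n", (int)is_endian.little);
    return 0;
}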
+ */ +void blst_fr_add(vec256 ret, const vec256 a, const vec256 b) +{ add_mod_256(ret, a, b, BLS12_381_r); } + +void blst_fr_sub(vec256 ret, const vec256 a, const vec256 b) +{ sub_mod_256(ret, a, b, BLS12_381_r); } + +void blst_fr_mul_by_3(vec256 ret, const vec256 a) +{ mul_by_3_mod_256(ret, a, BLS12_381_r); } + +void blst_fr_lshift(vec256 ret, const vec256 a, size_t count) +{ lshift_mod_256(ret, a, count, BLS12_381_r); } + +void blst_fr_rshift(vec256 ret, const vec256 a, size_t count) +{ rshift_mod_256(ret, a, count, BLS12_381_r); } + +void blst_fr_mul(vec256 ret, const vec256 a, const vec256 b) +{ mul_mont_sparse_256(ret, a, b, BLS12_381_r, r0); } + +void blst_fr_sqr(vec256 ret, const vec256 a) +{ sqr_mont_sparse_256(ret, a, BLS12_381_r, r0); } + +void blst_fr_cneg(vec256 ret, const vec256 a, int flag) +{ cneg_mod_256(ret, a, is_zero(flag) ^ 1, BLS12_381_r); } + +void blst_fr_to(vec256 ret, const vec256 a) +{ mul_mont_sparse_256(ret, a, BLS12_381_rRR, BLS12_381_r, r0); } + +void blst_fr_from(vec256 ret, const vec256 a) +{ from_mont_256(ret, a, BLS12_381_r, r0); } + +void blst_fr_from_scalar(vec256 ret, const pow256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if ((uptr_t)ret == (uptr_t)a && is_endian.little) { + mul_mont_sparse_256(ret, (const limb_t *)a, BLS12_381_rRR, + BLS12_381_r, r0); + } else { + vec256 out; + limbs_from_le_bytes(out, a, 32); + mul_mont_sparse_256(ret, out, BLS12_381_rRR, BLS12_381_r, r0); + vec_zero(out, sizeof(out)); + } +} + +void blst_scalar_from_fr(pow256 ret, const vec256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if ((uptr_t)ret == (uptr_t)a && is_endian.little) { + from_mont_256((limb_t *)ret, a, BLS12_381_r, r0); + } else { + vec256 out; + from_mont_256(out, a, BLS12_381_r, r0); + le_bytes_from_limbs(ret, out, 32); + vec_zero(out, sizeof(out)); + } +} + +int blst_scalar_fr_check(const pow256 a) +{ return (int)(check_mod_256(a, BLS12_381_r) | + bytes_are_zero(a, sizeof(pow256))); +} + +int blst_sk_check(const pow256 a) +{ return (int)check_mod_256(a, BLS12_381_r); } + +int blst_sk_add_n_check(pow256 ret, const pow256 a, const pow256 b) +{ return (int)add_n_check_mod_256(ret, a, b, BLS12_381_r); } + +int blst_sk_sub_n_check(pow256 ret, const pow256 a, const pow256 b) +{ return (int)sub_n_check_mod_256(ret, a, b, BLS12_381_r); } + +int blst_sk_mul_n_check(pow256 ret, const pow256 a, const pow256 b) +{ + vec256 a_fr, b_fr; + const union { + long one; + char little; + } is_endian = { 1 }; + + if (((size_t)a|(size_t)b)%sizeof(limb_t) != 0 || !is_endian.little) { + limbs_from_le_bytes(a_fr, a, sizeof(a_fr)); + limbs_from_le_bytes(b_fr, b, sizeof(a_fr)); + a = (const byte *)a_fr; + b = (const byte *)b_fr; + } + mul_mont_sparse_256(a_fr, (const limb_t *)a, BLS12_381_rRR, + BLS12_381_r, r0); + mul_mont_sparse_256(b_fr, (const limb_t *)b, BLS12_381_rRR, + BLS12_381_r, r0); + mul_mont_sparse_256(a_fr, a_fr, b_fr, BLS12_381_r, r0); + from_mont_256(a_fr, a_fr, BLS12_381_r, r0); + le_bytes_from_limbs(ret, a_fr, sizeof(a_fr)); + + return (int)(vec_is_zero(a_fr, sizeof(a_fr)) ^ 1); +} + +void blst_sk_inverse(pow256 ret, const pow256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (((size_t)a|(size_t)ret)%sizeof(limb_t) == 0 && is_endian.little) { + limb_t *out = (limb_t *)ret; + mul_mont_sparse_256(out, (const limb_t *)a, BLS12_381_rRR, + BLS12_381_r, r0); + reciprocal_fr(out, out); + from_mont_256(out, out, BLS12_381_r, r0); + } else { + vec256 out; + limbs_from_le_bytes(out, a, 
32); + mul_mont_sparse_256(out, out, BLS12_381_rRR, BLS12_381_r, r0); + reciprocal_fr(out, out); + from_mont_256(out, out, BLS12_381_r, r0); + le_bytes_from_limbs(ret, out, 32); + vec_zero(out, sizeof(out)); + } +} + +/* + * BLS12-381-specifc Fp shortcuts to assembly. + */ +void blst_fp_add(vec384 ret, const vec384 a, const vec384 b) +{ add_fp(ret, a, b); } + +void blst_fp_sub(vec384 ret, const vec384 a, const vec384 b) +{ sub_fp(ret, a, b); } + +void blst_fp_mul_by_3(vec384 ret, const vec384 a) +{ mul_by_3_fp(ret, a); } + +void blst_fp_mul_by_8(vec384 ret, const vec384 a) +{ mul_by_8_fp(ret, a); } + +void blst_fp_lshift(vec384 ret, const vec384 a, size_t count) +{ lshift_fp(ret, a, count); } + +void blst_fp_mul(vec384 ret, const vec384 a, const vec384 b) +{ mul_fp(ret, a, b); } + +void blst_fp_sqr(vec384 ret, const vec384 a) +{ sqr_fp(ret, a); } + +void blst_fp_cneg(vec384 ret, const vec384 a, int flag) +{ cneg_fp(ret, a, is_zero(flag) ^ 1); } + +void blst_fp_to(vec384 ret, const vec384 a) +{ mul_fp(ret, a, BLS12_381_RR); } + +void blst_fp_from(vec384 ret, const vec384 a) +{ from_fp(ret, a); } + +/* + * Fp serialization/deserialization. + */ +void blst_fp_from_uint32(vec384 ret, const unsigned int a[12]) +{ + if (sizeof(limb_t) == 8) { + int i; + for (i = 0; i < 6; i++) + ret[i] = a[2*i] | ((limb_t)a[2*i+1] << (32 & (8*sizeof(limb_t)-1))); + a = (const unsigned int *)ret; + } + mul_fp(ret, (const limb_t *)a, BLS12_381_RR); +} + +void blst_uint32_from_fp(unsigned int ret[12], const vec384 a) +{ + if (sizeof(limb_t) == 4) { + from_fp((limb_t *)ret, a); + } else { + vec384 out; + int i; + + from_fp(out, a); + for (i = 0; i < 6; i++) { + limb_t limb = out[i]; + ret[2*i] = (unsigned int)limb; + ret[2*i+1] = (unsigned int)(limb >> (32 & (8*sizeof(limb_t)-1))); + } + } +} + +void blst_fp_from_uint64(vec384 ret, const unsigned long long a[6]) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 4 && !is_endian.little) { + int i; + for (i = 0; i < 6; i++) { + unsigned long long limb = a[i]; + ret[2*i] = (limb_t)limb; + ret[2*i+1] = (limb_t)(limb >> 32); + } + a = (const unsigned long long *)ret; + } + mul_fp(ret, (const limb_t *)a, BLS12_381_RR); +} + +void blst_uint64_from_fp(unsigned long long ret[6], const vec384 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 8 || is_endian.little) { + from_fp((limb_t *)ret, a); + } else { + vec384 out; + int i; + + from_fp(out, a); + for (i = 0; i < 6; i++) + ret[i] = out[2*i] | ((unsigned long long)out[2*i+1] << 32); + } +} + +void blst_fp_from_bendian(vec384 ret, const unsigned char a[48]) +{ + vec384 out; + + limbs_from_be_bytes(out, a, sizeof(vec384)); + mul_fp(ret, out, BLS12_381_RR); +} + +void blst_bendian_from_fp(unsigned char ret[48], const vec384 a) +{ + vec384 out; + + from_fp(out, a); + be_bytes_from_limbs(ret, out, sizeof(vec384)); +} + +void blst_fp_from_lendian(vec384 ret, const unsigned char a[48]) +{ + vec384 out; + + limbs_from_le_bytes(out, a, sizeof(vec384)); + mul_fp(ret, out, BLS12_381_RR); +} + +void blst_lendian_from_fp(unsigned char ret[48], const vec384 a) +{ + vec384 out; + + from_fp(out, a); + le_bytes_from_limbs(ret, out, sizeof(vec384)); +} + +/* + * BLS12-381-specifc Fp2 shortcuts to assembly. 
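/*
 * Aside, not part of the vendored sources: a standalone toy of the limb
 * packing performed by blst_fp_from_uint32()/blst_uint32_from_fp() above
 * on 64-bit builds, where two 32-bit words make up one 64-bit limb.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t words[2] = { 0x89abcdefu, 0x01234567u };
    uint64_t limb = words[0] | ((uint64_t)words[1] << 32);  /* pack */

    uint32_t lo = (uint32_t)limb;                           /* unpack */
    uint32_t hi = (uint32_t)(limb >> 32);

    printf("%016llx -> %08x %08x\n", (unsigned long long)limb, lo, hi);
    return 0;   /* 0123456789abcdef -> 89abcdef 01234567 */
}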
+ */ +void blst_fp2_add(vec384x ret, const vec384x a, const vec384x b) +{ add_fp2(ret, a, b); } + +void blst_fp2_sub(vec384x ret, const vec384x a, const vec384x b) +{ sub_fp2(ret, a, b); } + +void blst_fp2_mul_by_3(vec384x ret, const vec384x a) +{ mul_by_3_fp2(ret, a); } + +void blst_fp2_mul_by_8(vec384x ret, const vec384x a) +{ mul_by_8_fp2(ret, a); } + +void blst_fp2_lshift(vec384x ret, const vec384x a, size_t count) +{ lshift_fp2(ret, a, count); } + +void blst_fp2_mul(vec384x ret, const vec384x a, const vec384x b) +{ mul_fp2(ret, a, b); } + +void blst_fp2_sqr(vec384x ret, const vec384x a) +{ sqr_fp2(ret, a); } + +void blst_fp2_cneg(vec384x ret, const vec384x a, int flag) +{ cneg_fp2(ret, a, is_zero(flag) ^ 1); } + +/* + * Scalar serialization/deseriazation + */ +void blst_scalar_from_uint32(pow256 ret, const unsigned int a[8]) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + size_t i; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + for(i = 0; i < 8; i++) { + unsigned int w = a[i]; + *ret++ = (byte)w; + *ret++ = (byte)(w >> 8); + *ret++ = (byte)(w >> 16); + *ret++ = (byte)(w >> 24); + } +} + +void blst_uint32_from_scalar(unsigned int ret[8], const pow256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + size_t i; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + for(i = 0; i < 8; i++) { + unsigned int w = (unsigned int)(*a++); + w |= (unsigned int)(*a++) << 8; + w |= (unsigned int)(*a++) << 16; + w |= (unsigned int)(*a++) << 24; + ret[i] = w; + } +} + +void blst_scalar_from_uint64(pow256 ret, const unsigned long long a[4]) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + size_t i; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + for(i = 0; i < 4; i++) { + unsigned long long w = a[i]; + *ret++ = (byte)w; + *ret++ = (byte)(w >> 8); + *ret++ = (byte)(w >> 16); + *ret++ = (byte)(w >> 24); + *ret++ = (byte)(w >> 32); + *ret++ = (byte)(w >> 40); + *ret++ = (byte)(w >> 48); + *ret++ = (byte)(w >> 56); + } +} + +void blst_uint64_from_scalar(unsigned long long ret[4], const pow256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + size_t i; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + for(i = 0; i < 4; i++) { + unsigned long long w = (unsigned long long)(*a++); + w |= (unsigned long long)(*a++) << 8; + w |= (unsigned long long)(*a++) << 16; + w |= (unsigned long long)(*a++) << 24; + w |= (unsigned long long)(*a++) << 32; + w |= (unsigned long long)(*a++) << 40; + w |= (unsigned long long)(*a++) << 48; + w |= (unsigned long long)(*a++) << 56; + ret[i] = w; + } +} + +void blst_scalar_from_bendian(pow256 ret, const unsigned char a[32]) +{ + vec256 out; + limbs_from_be_bytes(out, a, sizeof(out)); + le_bytes_from_limbs(ret, out, sizeof(out)); + vec_zero(out, sizeof(out)); +} + +void blst_bendian_from_scalar(unsigned char ret[32], const pow256 a) +{ + vec256 out; + limbs_from_le_bytes(out, a, sizeof(out)); + be_bytes_from_limbs(ret, out, sizeof(out)); + vec_zero(out, sizeof(out)); +} + +void blst_scalar_from_lendian(pow256 ret, const unsigned char a[32]) +{ + size_t i; + + if ((uptr_t)ret==(uptr_t)a) + return; + + for (i = 0; i < 32; i++) + ret[i] = a[i]; +} + +void blst_lendian_from_scalar(unsigned char ret[32], const pow256 a) +{ + size_t i; + + if ((uptr_t)ret==(uptr_t)a) + return; + + for (i = 0; i < 32; i++) + ret[i] = a[i]; +} + +void blst_fr_from_uint64(vec256 ret, const unsigned long long a[4]) +{ + const union { + long one; + char 
little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 4 && !is_endian.little) { + int i; + for (i = 0; i < 4; i++) { + unsigned long long limb = a[i]; + ret[2*i] = (limb_t)limb; + ret[2*i+1] = (limb_t)(limb >> 32); + } + a = (const unsigned long long *)ret; + } + mul_mont_sparse_256(ret, (const limb_t *)a, BLS12_381_rRR, BLS12_381_r, r0); +} + +void blst_uint64_from_fr(unsigned long long ret[4], const vec256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 8 || is_endian.little) { + from_mont_256((limb_t *)ret, a, BLS12_381_r, r0); + } else { + vec256 out; + int i; + + from_mont_256(out, a, BLS12_381_r, r0); + for (i = 0; i < 4; i++) + ret[i] = out[2*i] | ((unsigned long long)out[2*i+1] << 32); + vec_zero(out, sizeof(out)); + } +} + +int blst_scalar_from_le_bytes(pow256 out, const unsigned char *bytes, size_t n) +{ + struct { vec256 out, digit, radix; } t; + limb_t ret; + + vec_zero(t.out, sizeof(t.out)); + vec_copy(t.radix, BLS12_381_rRR, sizeof(t.radix)); + + while (n > 32) { + limbs_from_le_bytes(t.digit, bytes, 32); + from_mont_256(t.digit, t.digit, BLS12_381_r, r0); + mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0); + add_mod_256(t.out, t.out, t.digit, BLS12_381_r); + mul_mont_sparse_256(t.radix, t.radix, BLS12_381_rRR, BLS12_381_r, r0); + bytes += 32; + n -= 32; + } + + vec_zero(t.digit, sizeof(t.digit)); + limbs_from_le_bytes(t.digit, bytes, n); + from_mont_256(t.digit, t.digit, BLS12_381_r, r0); + mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0); + add_mod_256(t.out, t.out, t.digit, BLS12_381_r); + + ret = vec_is_zero(t.out, sizeof(t.out)); + le_bytes_from_limbs(out, t.out, 32); + vec_zero(t.out, 2*sizeof(t.out)); + + return (int)(ret^1); +} + +int blst_scalar_from_be_bytes(pow256 out, const unsigned char *bytes, size_t n) +{ + struct { vec256 out, digit, radix; } t; + limb_t ret; + + vec_zero(t.out, sizeof(t.out)); + vec_copy(t.radix, BLS12_381_rRR, sizeof(t.radix)); + + bytes += n; + while (n > 32) { + limbs_from_be_bytes(t.digit, bytes -= 32, 32); + from_mont_256(t.digit, t.digit, BLS12_381_r, r0); + mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0); + add_mod_256(t.out, t.out, t.digit, BLS12_381_r); + mul_mont_sparse_256(t.radix, t.radix, BLS12_381_rRR, BLS12_381_r, r0); + n -= 32; + } + + vec_zero(t.digit, sizeof(t.digit)); + limbs_from_be_bytes(t.digit, bytes -= n, n); + from_mont_256(t.digit, t.digit, BLS12_381_r, r0); + mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0); + add_mod_256(t.out, t.out, t.digit, BLS12_381_r); + + ret = vec_is_zero(t.out, sizeof(t.out)); + le_bytes_from_limbs(out, t.out, 32); + vec_zero(t.out, 2*sizeof(t.out)); + + return (int)(ret^1); +} + +/* + * Test facilitator + */ +void blst_scalar_from_hexascii(pow256 ret, const char *hex) +{ bytes_from_hexascii(ret, sizeof(pow256), hex); } + +void blst_fr_from_hexascii(vec256 ret, const char *hex) +{ + limbs_from_hexascii(ret, sizeof(vec256), hex); + mul_mont_sparse_256(ret, ret, BLS12_381_rRR, BLS12_381_r, r0); +} + +void blst_fp_from_hexascii(vec384 ret, const char *hex) +{ + limbs_from_hexascii(ret, sizeof(vec384), hex); + mul_fp(ret, ret, BLS12_381_RR); +} diff --git a/crypto/blst_src/fields.h b/crypto/blst_src/fields.h new file mode 100644 index 00000000000..515219f62dd --- /dev/null +++ b/crypto/blst_src/fields.h @@ -0,0 +1,116 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_FIELDS_H__ +#define __BLS12_381_ASM_FIELDS_H__ + +#include "vect.h" +#include "consts.h" + +/* + * BLS12-381-specifc Fp shortcuts to assembly. + */ +static inline void add_fp(vec384 ret, const vec384 a, const vec384 b) +{ add_mod_384(ret, a, b, BLS12_381_P); } + +static inline void sub_fp(vec384 ret, const vec384 a, const vec384 b) +{ sub_mod_384(ret, a, b, BLS12_381_P); } + +static inline void mul_by_3_fp(vec384 ret, const vec384 a) +{ mul_by_3_mod_384(ret, a, BLS12_381_P); } + +static inline void mul_by_8_fp(vec384 ret, const vec384 a) +{ mul_by_8_mod_384(ret, a, BLS12_381_P); } + +static inline void lshift_fp(vec384 ret, const vec384 a, size_t count) +{ lshift_mod_384(ret, a, count, BLS12_381_P); } + +static inline void rshift_fp(vec384 ret, const vec384 a, size_t count) +{ rshift_mod_384(ret, a, count, BLS12_381_P); } + +static inline void div_by_2_fp(vec384 ret, const vec384 a) +{ div_by_2_mod_384(ret, a, BLS12_381_P); } + +static inline void mul_fp(vec384 ret, const vec384 a, const vec384 b) +{ mul_mont_384(ret, a, b, BLS12_381_P, p0); } + +static inline void sqr_fp(vec384 ret, const vec384 a) +{ sqr_mont_384(ret, a, BLS12_381_P, p0); } + +static inline void cneg_fp(vec384 ret, const vec384 a, bool_t flag) +{ cneg_mod_384(ret, a, flag, BLS12_381_P); } + +static inline void from_fp(vec384 ret, const vec384 a) +{ from_mont_384(ret, a, BLS12_381_P, p0); } + +static inline void redc_fp(vec384 ret, const vec768 a) +{ redc_mont_384(ret, a, BLS12_381_P, p0); } + +/* + * BLS12-381-specifc Fp2 shortcuts to assembly. + */ +static inline void add_fp2(vec384x ret, const vec384x a, const vec384x b) +{ add_mod_384x(ret, a, b, BLS12_381_P); } + +static inline void sub_fp2(vec384x ret, const vec384x a, const vec384x b) +{ sub_mod_384x(ret, a, b, BLS12_381_P); } + +static inline void mul_by_3_fp2(vec384x ret, const vec384x a) +{ mul_by_3_mod_384x(ret, a, BLS12_381_P); } + +static inline void mul_by_8_fp2(vec384x ret, const vec384x a) +{ mul_by_8_mod_384x(ret, a, BLS12_381_P); } + +static inline void lshift_fp2(vec384x ret, const vec384x a, size_t count) +{ + lshift_mod_384(ret[0], a[0], count, BLS12_381_P); + lshift_mod_384(ret[1], a[1], count, BLS12_381_P); +} + +static inline void mul_fp2(vec384x ret, const vec384x a, const vec384x b) +{ mul_mont_384x(ret, a, b, BLS12_381_P, p0); } + +static inline void sqr_fp2(vec384x ret, const vec384x a) +{ sqr_mont_384x(ret, a, BLS12_381_P, p0); } + +static inline void cneg_fp2(vec384x ret, const vec384x a, bool_t flag) +{ + cneg_mod_384(ret[0], a[0], flag, BLS12_381_P); + cneg_mod_384(ret[1], a[1], flag, BLS12_381_P); +} + +#define vec_load_global vec_copy + +static void reciprocal_fp(vec384 out, const vec384 inp); +static void flt_reciprocal_fp(vec384 out, const vec384 inp); +static bool_t recip_sqrt_fp(vec384 out, const vec384 inp); +static bool_t sqrt_fp(vec384 out, const vec384 inp); + +static void reciprocal_fp2(vec384x out, const vec384x inp); +static void flt_reciprocal_fp2(vec384x out, const vec384x inp); +static bool_t recip_sqrt_fp2(vec384x out, const vec384x inp, + const vec384x recip_ZZZ, const vec384x magic_ZZZ); +static bool_t sqrt_fp2(vec384x out, const vec384x inp); +static bool_t sqrt_align_fp2(vec384x out, const vec384x ret, + const vec384x sqrt, const vec384x inp); + +typedef vec384x vec384fp2; +typedef vec384fp2 vec384fp6[3]; +typedef vec384fp6 vec384fp12[2]; + +static void sqr_fp12(vec384fp12 ret, const vec384fp12 a); +static void cyclotomic_sqr_fp12(vec384fp12 ret, 
const vec384fp12 a); +static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b); +static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0); +static void conjugate_fp12(vec384fp12 a); +static void inverse_fp12(vec384fp12 ret, const vec384fp12 a); +/* caveat lector! |n| has to be non-zero and not more than 3! */ +static void frobenius_map_fp12(vec384fp12 ret, const vec384fp12 a, size_t n); + +#define neg_fp(r,a) cneg_fp((r),(a),1) +#define neg_fp2(r,a) cneg_fp2((r),(a),1) + +#endif /* __BLS12_381_ASM_FIELDS_H__ */ diff --git a/crypto/blst_src/fp12_tower.c b/crypto/blst_src/fp12_tower.c new file mode 100644 index 00000000000..ab247a8ebf0 --- /dev/null +++ b/crypto/blst_src/fp12_tower.c @@ -0,0 +1,789 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" + +/* + * Fp2 = Fp[u] / (u^2 + 1) + * Fp6 = Fp2[v] / (v^3 - u - 1) + * Fp12 = Fp6[w] / (w^2 - v) + */ + +static inline void mul_by_u_plus_1_fp2(vec384x ret, const vec384x a) +{ mul_by_1_plus_i_mod_384x(ret, a, BLS12_381_P); } + +#if 1 && !defined(__BLST_NO_ASM__) +#define __FP2x2__ +/* + * Fp2x2 is a "widened" version of Fp2, which allows to consolidate + * reductions from several multiplications. In other words instead of + * "mul_redc-mul_redc-add" we get "mul-mul-add-redc," where latter + * addition is double-width... To be more specific this gives ~7-10% + * faster pairing depending on platform... + */ +typedef vec768 vec768x[2]; + +static inline void add_fp2x2(vec768x ret, const vec768x a, const vec768x b) +{ + add_mod_384x384(ret[0], a[0], b[0], BLS12_381_P); + add_mod_384x384(ret[1], a[1], b[1], BLS12_381_P); +} + +static inline void sub_fp2x2(vec768x ret, const vec768x a, const vec768x b) +{ + sub_mod_384x384(ret[0], a[0], b[0], BLS12_381_P); + sub_mod_384x384(ret[1], a[1], b[1], BLS12_381_P); +} + +static inline void mul_by_u_plus_1_fp2x2(vec768x ret, const vec768x a) +{ + /* caveat lector! 
|ret| may not be same as |a| */ + sub_mod_384x384(ret[0], a[0], a[1], BLS12_381_P); + add_mod_384x384(ret[1], a[0], a[1], BLS12_381_P); +} + +static inline void redc_fp2x2(vec384x ret, const vec768x a) +{ + redc_mont_384(ret[0], a[0], BLS12_381_P, p0); + redc_mont_384(ret[1], a[1], BLS12_381_P, p0); +} + +static void mul_fp2x2(vec768x ret, const vec384x a, const vec384x b) +{ +#if 1 + mul_382x(ret, a, b, BLS12_381_P); /* +~6% in Miller loop */ +#else + union { vec384 x[2]; vec768 x2; } t; + + add_mod_384(t.x[0], a[0], a[1], BLS12_381_P); + add_mod_384(t.x[1], b[0], b[1], BLS12_381_P); + mul_384(ret[1], t.x[0], t.x[1]); + + mul_384(ret[0], a[0], b[0]); + mul_384(t.x2, a[1], b[1]); + + sub_mod_384x384(ret[1], ret[1], ret[0], BLS12_381_P); + sub_mod_384x384(ret[1], ret[1], t.x2, BLS12_381_P); + + sub_mod_384x384(ret[0], ret[0], t.x2, BLS12_381_P); +#endif +} + +static void sqr_fp2x2(vec768x ret, const vec384x a) +{ +#if 1 + sqr_382x(ret, a, BLS12_381_P); /* +~5% in final exponentiation */ +#else + vec384 t0, t1; + + add_mod_384(t0, a[0], a[1], BLS12_381_P); + sub_mod_384(t1, a[0], a[1], BLS12_381_P); + + mul_384(ret[1], a[0], a[1]); + add_mod_384x384(ret[1], ret[1], ret[1], BLS12_381_P); + + mul_384(ret[0], t0, t1); +#endif +} +#endif /* __FP2x2__ */ + +/* + * Fp6 extension + */ +#if defined(__FP2x2__) /* ~10-13% improvement for mul_fp12 and sqr_fp12 */ +typedef vec768x vec768fp6[3]; + +static inline void sub_fp6x2(vec768fp6 ret, const vec768fp6 a, + const vec768fp6 b) +{ + sub_fp2x2(ret[0], a[0], b[0]); + sub_fp2x2(ret[1], a[1], b[1]); + sub_fp2x2(ret[2], a[2], b[2]); +} + +static void mul_fp6x2(vec768fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec768x t0, t1, t2; + vec384x aa, bb; + + mul_fp2x2(t0, a[0], b[0]); + mul_fp2x2(t1, a[1], b[1]); + mul_fp2x2(t2, a[2], b[2]); + + /* ret[0] = ((a1 + a2)*(b1 + b2) - a1*b1 - a2*b2)*(u+1) + a0*b0 + = (a1*b2 + a2*b1)*(u+1) + a0*b0 */ + add_fp2(aa, a[1], a[2]); + add_fp2(bb, b[1], b[2]); + mul_fp2x2(ret[0], aa, bb); + sub_fp2x2(ret[0], ret[0], t1); + sub_fp2x2(ret[0], ret[0], t2); + mul_by_u_plus_1_fp2x2(ret[1], ret[0]); /* borrow ret[1] for a moment */ + add_fp2x2(ret[0], ret[1], t0); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*b2*(u+1) + = a0*b1 + a1*b0 + a2*b2*(u+1) */ + add_fp2(aa, a[0], a[1]); + add_fp2(bb, b[0], b[1]); + mul_fp2x2(ret[1], aa, bb); + sub_fp2x2(ret[1], ret[1], t0); + sub_fp2x2(ret[1], ret[1], t1); + mul_by_u_plus_1_fp2x2(ret[2], t2); /* borrow ret[2] for a moment */ + add_fp2x2(ret[1], ret[1], ret[2]); + + /* ret[2] = (a0 + a2)*(b0 + b2) - a0*b0 - a2*b2 + a1*b1 + = a0*b2 + a2*b0 + a1*b1 */ + add_fp2(aa, a[0], a[2]); + add_fp2(bb, b[0], b[2]); + mul_fp2x2(ret[2], aa, bb); + sub_fp2x2(ret[2], ret[2], t0); + sub_fp2x2(ret[2], ret[2], t2); + add_fp2x2(ret[2], ret[2], t1); +} + +static inline void redc_fp6x2(vec384fp6 ret, const vec768fp6 a) +{ + redc_fp2x2(ret[0], a[0]); + redc_fp2x2(ret[1], a[1]); + redc_fp2x2(ret[2], a[2]); +} + +static void mul_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec768fp6 r; + + mul_fp6x2(r, a, b); + redc_fp6x2(ret, r); /* narrow to normal width */ +} + +static void sqr_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec768x s0, m01, m12, s2, rx; + + sqr_fp2x2(s0, a[0]); + + mul_fp2x2(m01, a[0], a[1]); + add_fp2x2(m01, m01, m01); + + mul_fp2x2(m12, a[1], a[2]); + add_fp2x2(m12, m12, m12); + + sqr_fp2x2(s2, a[2]); + + /* ret[2] = (a0 + a1 + a2)^2 - a0^2 - a2^2 - 2*(a0*a1) - 2*(a1*a2) + = a1^2 + 2*(a0*a2) */ + add_fp2(ret[2], a[2], a[1]); + add_fp2(ret[2], ret[2], a[0]); + 
sqr_fp2x2(rx, ret[2]); + sub_fp2x2(rx, rx, s0); + sub_fp2x2(rx, rx, s2); + sub_fp2x2(rx, rx, m01); + sub_fp2x2(rx, rx, m12); + redc_fp2x2(ret[2], rx); + + /* ret[0] = a0^2 + 2*(a1*a2)*(u+1) */ + mul_by_u_plus_1_fp2x2(rx, m12); + add_fp2x2(rx, rx, s0); + redc_fp2x2(ret[0], rx); + + /* ret[1] = a2^2*(u+1) + 2*(a0*a1) */ + mul_by_u_plus_1_fp2x2(rx, s2); + add_fp2x2(rx, rx, m01); + redc_fp2x2(ret[1], rx); +} +#else +static void mul_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec384x t0, t1, t2, t3, t4, t5; + + mul_fp2(t0, a[0], b[0]); + mul_fp2(t1, a[1], b[1]); + mul_fp2(t2, a[2], b[2]); + + /* ret[0] = ((a1 + a2)*(b1 + b2) - a1*b1 - a2*b2)*(u+1) + a0*b0 + = (a1*b2 + a2*b1)*(u+1) + a0*b0 */ + add_fp2(t4, a[1], a[2]); + add_fp2(t5, b[1], b[2]); + mul_fp2(t3, t4, t5); + sub_fp2(t3, t3, t1); + sub_fp2(t3, t3, t2); + mul_by_u_plus_1_fp2(t3, t3); + /* add_fp2(ret[0], t3, t0); considering possible aliasing... */ + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*b2*(u+1) + = a0*b1 + a1*b0 + a2*b2*(u+1) */ + add_fp2(t4, a[0], a[1]); + add_fp2(t5, b[0], b[1]); + mul_fp2(ret[1], t4, t5); + sub_fp2(ret[1], ret[1], t0); + sub_fp2(ret[1], ret[1], t1); + mul_by_u_plus_1_fp2(t4, t2); + add_fp2(ret[1], ret[1], t4); + + /* ret[2] = (a0 + a2)*(b0 + b2) - a0*b0 - a2*b2 + a1*b1 + = a0*b2 + a2*b0 + a1*b1 */ + add_fp2(t4, a[0], a[2]); + add_fp2(t5, b[0], b[2]); + mul_fp2(ret[2], t4, t5); + sub_fp2(ret[2], ret[2], t0); + sub_fp2(ret[2], ret[2], t2); + add_fp2(ret[2], ret[2], t1); + + add_fp2(ret[0], t3, t0); /* ... moved from above */ +} + +static void sqr_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec384x s0, m01, m12, s2; + + sqr_fp2(s0, a[0]); + + mul_fp2(m01, a[0], a[1]); + add_fp2(m01, m01, m01); + + mul_fp2(m12, a[1], a[2]); + add_fp2(m12, m12, m12); + + sqr_fp2(s2, a[2]); + + /* ret[2] = (a0 + a1 + a2)^2 - a0^2 - a2^2 - 2*(a0*a1) - 2*(a1*a2) + = a1^2 + 2*(a0*a2) */ + add_fp2(ret[2], a[2], a[1]); + add_fp2(ret[2], ret[2], a[0]); + sqr_fp2(ret[2], ret[2]); + sub_fp2(ret[2], ret[2], s0); + sub_fp2(ret[2], ret[2], s2); + sub_fp2(ret[2], ret[2], m01); + sub_fp2(ret[2], ret[2], m12); + + /* ret[0] = a0^2 + 2*(a1*a2)*(u+1) */ + mul_by_u_plus_1_fp2(ret[0], m12); + add_fp2(ret[0], ret[0], s0); + + /* ret[1] = a2^2*(u+1) + 2*(a0*a1) */ + mul_by_u_plus_1_fp2(ret[1], s2); + add_fp2(ret[1], ret[1], m01); +} +#endif + +static void add_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + add_fp2(ret[0], a[0], b[0]); + add_fp2(ret[1], a[1], b[1]); + add_fp2(ret[2], a[2], b[2]); +} + +static void sub_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + sub_fp2(ret[0], a[0], b[0]); + sub_fp2(ret[1], a[1], b[1]); + sub_fp2(ret[2], a[2], b[2]); +} + +static void neg_fp6(vec384fp6 ret, const vec384fp6 a) +{ + neg_fp2(ret[0], a[0]); + neg_fp2(ret[1], a[1]); + neg_fp2(ret[2], a[2]); +} + +#if 0 +#define mul_by_v_fp6 mul_by_v_fp6 +static void mul_by_v_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec384x t; + + mul_by_u_plus_1_fp2(t, a[2]); + vec_copy(ret[2], a[1], sizeof(a[1])); + vec_copy(ret[1], a[0], sizeof(a[0])); + vec_copy(ret[0], t, sizeof(t)); +} +#endif + +/* + * Fp12 extension + */ +#if defined(__FP2x2__) +static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) +{ + vec768fp6 t0, t1, rx; + vec384fp6 t2; + + mul_fp6x2(t0, a[0], b[0]); + mul_fp6x2(t1, a[1], b[1]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + add_fp6(t2, a[0], a[1]); + add_fp6(ret[1], b[0], b[1]); + mul_fp6x2(rx, ret[1], t2); + sub_fp6x2(rx, rx, t0); + sub_fp6x2(rx, 
rx, t1); + redc_fp6x2(ret[1], rx); + + /* ret[0] = a0*b0 + a1*b1*v */ + mul_by_u_plus_1_fp2x2(rx[0], t1[2]); + add_fp2x2(rx[0], t0[0], rx[0]); + add_fp2x2(rx[1], t0[1], t1[0]); + add_fp2x2(rx[2], t0[2], t1[1]); + redc_fp6x2(ret[0], rx); +} + +static inline void mul_by_0y0_fp6x2(vec768fp6 ret, const vec384fp6 a, + const vec384fp2 b) +{ + mul_fp2x2(ret[1], a[2], b); /* borrow ret[1] for a moment */ + mul_by_u_plus_1_fp2x2(ret[0], ret[1]); + mul_fp2x2(ret[1], a[0], b); + mul_fp2x2(ret[2], a[1], b); +} + +static void mul_by_xy0_fp6x2(vec768fp6 ret, const vec384fp6 a, + const vec384fp6 b) +{ + vec768x t0, t1; + vec384x aa, bb; + + mul_fp2x2(t0, a[0], b[0]); + mul_fp2x2(t1, a[1], b[1]); + + /* ret[0] = ((a1 + a2)*(b1 + 0) - a1*b1 - a2*0)*(u+1) + a0*b0 + = (a1*0 + a2*b1)*(u+1) + a0*b0 */ + mul_fp2x2(ret[1], a[2], b[1]); /* borrow ret[1] for a moment */ + mul_by_u_plus_1_fp2x2(ret[0], ret[1]); + add_fp2x2(ret[0], ret[0], t0); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*0*(u+1) + = a0*b1 + a1*b0 + a2*0*(u+1) */ + add_fp2(aa, a[0], a[1]); + add_fp2(bb, b[0], b[1]); + mul_fp2x2(ret[1], aa, bb); + sub_fp2x2(ret[1], ret[1], t0); + sub_fp2x2(ret[1], ret[1], t1); + + /* ret[2] = (a0 + a2)*(b0 + 0) - a0*b0 - a2*0 + a1*b1 + = a0*0 + a2*b0 + a1*b1 */ + mul_fp2x2(ret[2], a[2], b[0]); + add_fp2x2(ret[2], ret[2], t1); +} + +static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0) +{ + vec768fp6 t0, t1, rr; + vec384fp6 t2; + + mul_by_xy0_fp6x2(t0, a[0], xy00z0); + mul_by_0y0_fp6x2(t1, a[1], xy00z0[2]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + vec_copy(t2[0], xy00z0[0], sizeof(t2[0])); + add_fp2(t2[1], xy00z0[1], xy00z0[2]); + add_fp6(ret[1], a[0], a[1]); + mul_by_xy0_fp6x2(rr, ret[1], t2); + sub_fp6x2(rr, rr, t0); + sub_fp6x2(rr, rr, t1); + redc_fp6x2(ret[1], rr); + + /* ret[0] = a0*b0 + a1*b1*v */ + mul_by_u_plus_1_fp2x2(rr[0], t1[2]); + add_fp2x2(rr[0], t0[0], rr[0]); + add_fp2x2(rr[1], t0[1], t1[0]); + add_fp2x2(rr[2], t0[2], t1[1]); + redc_fp6x2(ret[0], rr); +} +#else +static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) +{ + vec384fp6 t0, t1, t2; + + mul_fp6(t0, a[0], b[0]); + mul_fp6(t1, a[1], b[1]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + add_fp6(t2, a[0], a[1]); + add_fp6(ret[1], b[0], b[1]); + mul_fp6(ret[1], ret[1], t2); + sub_fp6(ret[1], ret[1], t0); + sub_fp6(ret[1], ret[1], t1); + + /* ret[0] = a0*b0 + a1*b1*v */ +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + add_fp6(ret[0], t0, t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + add_fp2(ret[0][0], t0[0], t1[2]); + add_fp2(ret[0][1], t0[1], t1[0]); + add_fp2(ret[0][2], t0[2], t1[1]); +#endif +} + +static inline void mul_by_0y0_fp6(vec384fp6 ret, const vec384fp6 a, + const vec384fp2 b) +{ + vec384x t; + + mul_fp2(t, a[2], b); + mul_fp2(ret[2], a[1], b); + mul_fp2(ret[1], a[0], b); + mul_by_u_plus_1_fp2(ret[0], t); +} + +static void mul_by_xy0_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec384x t0, t1, /*t2,*/ t3, t4, t5; + + mul_fp2(t0, a[0], b[0]); + mul_fp2(t1, a[1], b[1]); + + /* ret[0] = ((a1 + a2)*(b1 + 0) - a1*b1 - a2*0)*(u+1) + a0*b0 + = (a1*0 + a2*b1)*(u+1) + a0*b0 */ + mul_fp2(t3, a[2], b[1]); + mul_by_u_plus_1_fp2(t3, t3); + /* add_fp2(ret[0], t3, t0); considering possible aliasing... 
*/ + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*0*(u+1) + = a0*b1 + a1*b0 + a2*0*(u+1) */ + add_fp2(t4, a[0], a[1]); + add_fp2(t5, b[0], b[1]); + mul_fp2(ret[1], t4, t5); + sub_fp2(ret[1], ret[1], t0); + sub_fp2(ret[1], ret[1], t1); + + /* ret[2] = (a0 + a2)*(b0 + 0) - a0*b0 - a2*0 + a1*b1 + = a0*0 + a2*b0 + a1*b1 */ + mul_fp2(ret[2], a[2], b[0]); + add_fp2(ret[2], ret[2], t1); + + add_fp2(ret[0], t3, t0); /* ... moved from above */ +} + +static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0) +{ + vec384fp6 t0, t1, t2; + + mul_by_xy0_fp6(t0, a[0], xy00z0); + mul_by_0y0_fp6(t1, a[1], xy00z0[2]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + vec_copy(t2[0], xy00z0[0], sizeof(t2[0])); + add_fp2(t2[1], xy00z0[1], xy00z0[2]); + add_fp6(ret[1], a[0], a[1]); + mul_by_xy0_fp6(ret[1], ret[1], t2); + sub_fp6(ret[1], ret[1], t0); + sub_fp6(ret[1], ret[1], t1); + + /* ret[0] = a0*b0 + a1*b1*v */ +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + add_fp6(ret[0], t0, t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + add_fp2(ret[0][0], t0[0], t1[2]); + add_fp2(ret[0][1], t0[1], t1[0]); + add_fp2(ret[0][2], t0[2], t1[1]); +#endif +} +#endif + +static void sqr_fp12(vec384fp12 ret, const vec384fp12 a) +{ + vec384fp6 t0, t1; + + add_fp6(t0, a[0], a[1]); +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, a[1]); + add_fp6(t1, a[0], t1); +#else + mul_by_u_plus_1_fp2(t1[2], a[1][2]); + add_fp2(t1[0], a[0][0], t1[2]); + add_fp2(t1[1], a[0][1], a[1][0]); + add_fp2(t1[2], a[0][2], a[1][1]); +#endif + mul_fp6(t0, t0, t1); + mul_fp6(t1, a[0], a[1]); + + /* ret[1] = 2*(a0*a1) */ + add_fp6(ret[1], t1, t1); + + /* ret[0] = (a0 + a1)*(a0 + a1*v) - a0*a1 - a0*a1*v + = a0^2 + a1^2*v */ + sub_fp6(ret[0], t0, t1); +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + sub_fp6(ret[0], ret[0], t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + sub_fp2(ret[0][0], ret[0][0], t1[2]); + sub_fp2(ret[0][1], ret[0][1], t1[0]); + sub_fp2(ret[0][2], ret[0][2], t1[1]); +#endif +} + +static void conjugate_fp12(vec384fp12 a) +{ neg_fp6(a[1], a[1]); } + +static void inverse_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec384x c0, c1, c2, t0, t1; + + /* c0 = a0^2 - (a1*a2)*(u+1) */ + sqr_fp2(c0, a[0]); + mul_fp2(t0, a[1], a[2]); + mul_by_u_plus_1_fp2(t0, t0); + sub_fp2(c0, c0, t0); + + /* c1 = a2^2*(u+1) - (a0*a1) */ + sqr_fp2(c1, a[2]); + mul_by_u_plus_1_fp2(c1, c1); + mul_fp2(t0, a[0], a[1]); + sub_fp2(c1, c1, t0); + + /* c2 = a1^2 - a0*a2 */ + sqr_fp2(c2, a[1]); + mul_fp2(t0, a[0], a[2]); + sub_fp2(c2, c2, t0); + + /* (a2*c1 + a1*c2)*(u+1) + a0*c0 */ + mul_fp2(t0, c1, a[2]); + mul_fp2(t1, c2, a[1]); + add_fp2(t0, t0, t1); + mul_by_u_plus_1_fp2(t0, t0); + mul_fp2(t1, c0, a[0]); + add_fp2(t0, t0, t1); + + reciprocal_fp2(t1, t0); + + mul_fp2(ret[0], c0, t1); + mul_fp2(ret[1], c1, t1); + mul_fp2(ret[2], c2, t1); +} + +static void inverse_fp12(vec384fp12 ret, const vec384fp12 a) +{ + vec384fp6 t0, t1; + + sqr_fp6(t0, a[0]); + sqr_fp6(t1, a[1]); +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + sub_fp6(t0, t0, t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + sub_fp2(t0[0], t0[0], t1[2]); + sub_fp2(t0[1], t0[1], t1[0]); + sub_fp2(t0[2], t0[2], t1[1]); +#endif + + inverse_fp6(t1, t0); + + mul_fp6(ret[0], a[0], t1); + mul_fp6(ret[1], a[1], t1); + neg_fp6(ret[1], ret[1]); +} + +typedef vec384x vec384fp4[2]; + +#if defined(__FP2x2__) +static void sqr_fp4(vec384fp4 ret, const vec384x a0, const vec384x a1) +{ + vec768x t0, t1, t2; + + sqr_fp2x2(t0, a0); + sqr_fp2x2(t1, a1); + add_fp2(ret[1], a0, 
a1); + + mul_by_u_plus_1_fp2x2(t2, t1); + add_fp2x2(t2, t2, t0); + redc_fp2x2(ret[0], t2); + + sqr_fp2x2(t2, ret[1]); + sub_fp2x2(t2, t2, t0); + sub_fp2x2(t2, t2, t1); + redc_fp2x2(ret[1], t2); +} +#else +static void sqr_fp4(vec384fp4 ret, const vec384x a0, const vec384x a1) +{ + vec384x t0, t1; + + sqr_fp2(t0, a0); + sqr_fp2(t1, a1); + add_fp2(ret[1], a0, a1); + + mul_by_u_plus_1_fp2(ret[0], t1); + add_fp2(ret[0], ret[0], t0); + + sqr_fp2(ret[1], ret[1]); + sub_fp2(ret[1], ret[1], t0); + sub_fp2(ret[1], ret[1], t1); +} +#endif + +static void cyclotomic_sqr_fp12(vec384fp12 ret, const vec384fp12 a) +{ + vec384fp4 t0, t1, t2; + + sqr_fp4(t0, a[0][0], a[1][1]); + sqr_fp4(t1, a[1][0], a[0][2]); + sqr_fp4(t2, a[0][1], a[1][2]); + + sub_fp2(ret[0][0], t0[0], a[0][0]); + add_fp2(ret[0][0], ret[0][0], ret[0][0]); + add_fp2(ret[0][0], ret[0][0], t0[0]); + + sub_fp2(ret[0][1], t1[0], a[0][1]); + add_fp2(ret[0][1], ret[0][1], ret[0][1]); + add_fp2(ret[0][1], ret[0][1], t1[0]); + + sub_fp2(ret[0][2], t2[0], a[0][2]); + add_fp2(ret[0][2], ret[0][2], ret[0][2]); + add_fp2(ret[0][2], ret[0][2], t2[0]); + + mul_by_u_plus_1_fp2(t2[1], t2[1]); + add_fp2(ret[1][0], t2[1], a[1][0]); + add_fp2(ret[1][0], ret[1][0], ret[1][0]); + add_fp2(ret[1][0], ret[1][0], t2[1]); + + add_fp2(ret[1][1], t0[1], a[1][1]); + add_fp2(ret[1][1], ret[1][1], ret[1][1]); + add_fp2(ret[1][1], ret[1][1], t0[1]); + + add_fp2(ret[1][2], t1[1], a[1][2]); + add_fp2(ret[1][2], ret[1][2], ret[1][2]); + add_fp2(ret[1][2], ret[1][2], t1[1]); +} + +/* + * caveat lector! |n| has to be non-zero and not more than 3! + */ +static inline void frobenius_map_fp2(vec384x ret, const vec384x a, size_t n) +{ + vec_copy(ret[0], a[0], sizeof(ret[0])); + cneg_fp(ret[1], a[1], n & 1); +} + +static void frobenius_map_fp6(vec384fp6 ret, const vec384fp6 a, size_t n) +{ + static const vec384x coeffs1[] = { /* (u + 1)^((P^n - 1) / 3) */ + { { 0 }, + { TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), + TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), + TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) } }, + { { TO_LIMB_T(0x30f1361b798a64e8), TO_LIMB_T(0xf3b8ddab7ece5a2a), + TO_LIMB_T(0x16a8ca3ac61577f7), TO_LIMB_T(0xc26a2ff874fd029b), + TO_LIMB_T(0x3636b76660701c6e), TO_LIMB_T(0x051ba4ab241b6160) } }, + { { 0 }, { ONE_MONT_P } } + }; + static const vec384 coeffs2[] = { /* (u + 1)^((2P^n - 2) / 3) */ + { TO_LIMB_T(0x890dc9e4867545c3), TO_LIMB_T(0x2af322533285a5d5), + TO_LIMB_T(0x50880866309b7e2c), TO_LIMB_T(0xa20d1b8c7e881024), + TO_LIMB_T(0x14e4f04fe2db9068), TO_LIMB_T(0x14e56d3f1564853a) }, + { TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), + TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), + TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) }, + { TO_LIMB_T(0x43f5fffffffcaaae), TO_LIMB_T(0x32b7fff2ed47fffd), + TO_LIMB_T(0x07e83a49a2e99d69), TO_LIMB_T(0xeca8f3318332bb7a), + TO_LIMB_T(0xef148d1ea0f4c069), TO_LIMB_T(0x040ab3263eff0206) } + }; + + frobenius_map_fp2(ret[0], a[0], n); + frobenius_map_fp2(ret[1], a[1], n); + frobenius_map_fp2(ret[2], a[2], n); + --n; /* implied ONE_MONT_P at index 0 */ + mul_fp2(ret[1], ret[1], coeffs1[n]); + mul_fp(ret[2][0], ret[2][0], coeffs2[n]); + mul_fp(ret[2][1], ret[2][1], coeffs2[n]); +} + +static void frobenius_map_fp12(vec384fp12 ret, const vec384fp12 a, size_t n) +{ + static const vec384x coeffs[] = { /* (u + 1)^((P^n - 1) / 6) */ + { { TO_LIMB_T(0x07089552b319d465), TO_LIMB_T(0xc6695f92b50a8313), + TO_LIMB_T(0x97e83cccd117228f), 
TO_LIMB_T(0xa35baecab2dc29ee), + TO_LIMB_T(0x1ce393ea5daace4d), TO_LIMB_T(0x08f2220fb0fb66eb) }, + { TO_LIMB_T(0xb2f66aad4ce5d646), TO_LIMB_T(0x5842a06bfc497cec), + TO_LIMB_T(0xcf4895d42599d394), TO_LIMB_T(0xc11b9cba40a8e8d0), + TO_LIMB_T(0x2e3813cbe5a0de89), TO_LIMB_T(0x110eefda88847faf) } }, + { { TO_LIMB_T(0xecfb361b798dba3a), TO_LIMB_T(0xc100ddb891865a2c), + TO_LIMB_T(0x0ec08ff1232bda8e), TO_LIMB_T(0xd5c13cc6f1ca4721), + TO_LIMB_T(0x47222a47bf7b5c04), TO_LIMB_T(0x0110f184e51c5f59) } }, + { { TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), + TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), + TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } }, + }; + + frobenius_map_fp6(ret[0], a[0], n); + frobenius_map_fp6(ret[1], a[1], n); + --n; /* implied ONE_MONT_P at index 0 */ + mul_fp2(ret[1][0], ret[1][0], coeffs[n]); + mul_fp2(ret[1][1], ret[1][1], coeffs[n]); + mul_fp2(ret[1][2], ret[1][2], coeffs[n]); +} + + +/* + * BLS12-381-specifc Fp12 shortcuts. + */ +void blst_fp12_sqr(vec384fp12 ret, const vec384fp12 a) +{ sqr_fp12(ret, a); } + +void blst_fp12_cyclotomic_sqr(vec384fp12 ret, const vec384fp12 a) +{ cyclotomic_sqr_fp12(ret, a); } + +void blst_fp12_mul(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) +{ mul_fp12(ret, a, b); } + +void blst_fp12_mul_by_xy00z0(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0) +{ mul_by_xy00z0_fp12(ret, a, xy00z0); } + +void blst_fp12_conjugate(vec384fp12 a) +{ conjugate_fp12(a); } + +void blst_fp12_inverse(vec384fp12 ret, const vec384fp12 a) +{ inverse_fp12(ret, a); } + +/* caveat lector! |n| has to be non-zero and not more than 3! */ +void blst_fp12_frobenius_map(vec384fp12 ret, const vec384fp12 a, size_t n) +{ frobenius_map_fp12(ret, a, n); } + +int blst_fp12_is_equal(const vec384fp12 a, const vec384fp12 b) +{ return (int)vec_is_equal(a, b, sizeof(vec384fp12)); } + +int blst_fp12_is_one(const vec384fp12 a) +{ + return (int)(vec_is_equal(a[0][0], BLS12_381_Rx.p2, sizeof(a[0][0])) & + vec_is_zero(a[0][1], sizeof(vec384fp12) - sizeof(a[0][0]))); +} + +const vec384fp12 *blst_fp12_one(void) +{ return (const vec384fp12 *)BLS12_381_Rx.p12; } + +void blst_bendian_from_fp12(unsigned char ret[48*12], const vec384fp12 a) +{ + size_t i, j; + vec384 out; + + for (i = 0; i < 3; i++) { + for (j = 0; j < 2; j++) { + from_fp(out, a[j][i][0]); + be_bytes_from_limbs(ret, out, sizeof(vec384)); ret += 48; + from_fp(out, a[j][i][1]); + be_bytes_from_limbs(ret, out, sizeof(vec384)); ret += 48; + } + } +} + +size_t blst_fp12_sizeof(void) +{ return sizeof(vec384fp12); } diff --git a/crypto/blst_src/hash_to_field.c b/crypto/blst_src/hash_to_field.c new file mode 100644 index 00000000000..6816ea8b922 --- /dev/null +++ b/crypto/blst_src/hash_to_field.c @@ -0,0 +1,177 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "consts.h" +#include "sha256.h" + +static const vec384 BLS12_381_RRRR = { /* RR^2 */ + TO_LIMB_T(0xed48ac6bd94ca1e0), TO_LIMB_T(0x315f831e03a7adf8), + TO_LIMB_T(0x9a53352a615e29dd), TO_LIMB_T(0x34c04e5e921e1761), + TO_LIMB_T(0x2512d43565724728), TO_LIMB_T(0x0aa6346091755d4d) +}; + +#ifdef expand_message_xmd +void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len); +#else +static void sha256_init_Zpad(SHA256_CTX *ctx) +{ + ctx->h[0] = 0xda5698beU; + ctx->h[1] = 0x17b9b469U; + ctx->h[2] = 0x62335799U; + ctx->h[3] = 0x779fbecaU; + ctx->h[4] = 0x8ce5d491U; + ctx->h[5] = 0xc0d26243U; + ctx->h[6] = 0xbafef9eaU; + ctx->h[7] = 0x1837a9d8U; + ctx->N = 64; + vec_zero(ctx->buf, sizeof(ctx->buf)); + ctx->off = 0; +} + +static void vec_xor(void *restrict ret, const void *restrict a, + const void *restrict b, size_t num) +{ + limb_t *rp = (limb_t *)ret; + const limb_t *ap = (const limb_t *)a; + const limb_t *bp = (const limb_t *)b; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) + rp[i] = ap[i] ^ bp[i]; +} + +static void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len) +{ + union { limb_t align; unsigned char c[32]; } b_0; + union { limb_t align; unsigned char c[33+256+31]; } b_i; + unsigned char *p; + size_t i, b_i_bits, b_i_blocks; + SHA256_CTX ctx; + + /* + * compose template for 'strxor(b_0, b_(i-1)) || I2OSP(i, 1) || DST_prime' + */ + if (DST_len > 255) { + sha256_init(&ctx); + sha256_update(&ctx, "H2C-OVERSIZE-DST-", 17); + sha256_update(&ctx, DST, DST_len); + sha256_final(b_0.c, &ctx); + DST = b_0.c, DST_len = 32; + } + b_i_blocks = ((33 + DST_len + 1 + 9) + 63) & -64; + vec_zero(b_i.c + b_i_blocks - 64, 64); + + p = b_i.c + 33; + for (i = 0; i < DST_len; i++) + p[i] = DST[i]; + p[i++] = (unsigned char)DST_len; + p[i++] = 0x80; + p[i+6] = p[i+5] = p[i+4] = p[i+3] = p[i+2] = p[i+1] = p[i+0] = 0; + b_i_bits = (33 + DST_len + 1) * 8; + p = b_i.c + b_i_blocks; + p[-2] = (unsigned char)(b_i_bits >> 8); + p[-1] = (unsigned char)(b_i_bits); + + sha256_init_Zpad(&ctx); /* Z_pad | */ + sha256_update(&ctx, aug, aug_len); /* | aug | */ + sha256_update(&ctx, msg, msg_len); /* | msg | */ + /* | I2OSP(len_in_bytes, 2) || I2OSP(0, 1) || DST_prime */ + b_i.c[30] = (unsigned char)(len_in_bytes >> 8); + b_i.c[31] = (unsigned char)(len_in_bytes); + b_i.c[32] = 0; + sha256_update(&ctx, b_i.c + 30, 3 + DST_len + 1); + sha256_final(b_0.c, &ctx); + + sha256_init_h(ctx.h); + vec_copy(b_i.c, b_0.c, 32); + ++b_i.c[32]; + sha256_block_data_order(ctx.h, b_i.c, b_i_blocks / 64); + sha256_emit(bytes, ctx.h); + + len_in_bytes += 31; /* ell = ceil(len_in_bytes / b_in_bytes), with */ + len_in_bytes /= 32; /* caller being responsible for accordingly large + * buffer. hash_to_field passes one with length + * divisible by 64, remember? which works... 
*/ + while (--len_in_bytes) { + sha256_init_h(ctx.h); + vec_xor(b_i.c, b_0.c, bytes, 32); + bytes += 32; + ++b_i.c[32]; + sha256_block_data_order(ctx.h, b_i.c, b_i_blocks / 64); + sha256_emit(bytes, ctx.h); + } +} +#endif + +/* + * |nelems| is 'count * m' from spec + */ +static void hash_to_field(vec384 elems[], size_t nelems, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len) +{ + size_t L = sizeof(vec384) + 128/8; /* ceil((ceil(log2(p)) + k) / 8) */ + size_t len_in_bytes = L * nelems; /* divisible by 64, hurray! */ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 \ + || defined(__STDC_NO_VLA__) + limb_t *pseudo_random = alloca(len_in_bytes); +#else + limb_t pseudo_random[len_in_bytes/sizeof(limb_t)]; +#endif + unsigned char *bytes; + vec768 elem; + + aug_len = aug!=NULL ? aug_len : 0; + DST_len = DST!=NULL ? DST_len : 0; + + expand_message_xmd((unsigned char *)pseudo_random, len_in_bytes, + aug, aug_len, msg, msg_len, DST, DST_len); + + vec_zero(elem, sizeof(elem)); + bytes = (unsigned char *)pseudo_random; + while (nelems--) { + limbs_from_be_bytes(elem, bytes, L); + bytes += L; + /* + * L-bytes block % P, output is in Montgomery domain... + */ + redc_mont_384(elems[0], elem, BLS12_381_P, p0); + mul_mont_384(elems[0], elems[0], BLS12_381_RRRR, BLS12_381_P, p0); + elems++; + } +} + +void blst_expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len) +{ + size_t buf_len = (len_in_bytes+31) & ((size_t)0-32); + unsigned char *buf_ptr = bytes; + + if (buf_len > 255*32) + return; + + if (buf_len != len_in_bytes) + buf_ptr = alloca(buf_len); + + expand_message_xmd(buf_ptr, len_in_bytes, NULL, 0, msg, msg_len, + DST, DST_len); + if (buf_ptr != bytes) { + unsigned char *ptr = buf_ptr; + while (len_in_bytes--) + *bytes++ = *ptr++; + vec_zero(buf_ptr, buf_len); + } +} diff --git a/crypto/blst_src/keygen.c b/crypto/blst_src/keygen.c new file mode 100644 index 00000000000..9b62f16b534 --- /dev/null +++ b/crypto/blst_src/keygen.c @@ -0,0 +1,319 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "consts.h" +#include "bytes.h" +#include "sha256.h" + +typedef struct { + SHA256_CTX ctx; + unsigned int h_ipad[8]; + unsigned int h_opad[8]; + union { limb_t l[64/sizeof(limb_t)]; unsigned char c[64]; } tail; +} HMAC_SHA256_CTX; + +static void HMAC_init(HMAC_SHA256_CTX *ctx, const void *K, size_t K_len) +{ + size_t i; + + if (K == NULL) { /* reuse h_ipad and h_opad */ + sha256_hcopy(ctx->ctx.h, ctx->h_ipad); + ctx->ctx.N = 64; + vec_zero(ctx->ctx.buf, sizeof(ctx->ctx.buf)); + ctx->ctx.off = 0; + + return; + } + + vec_zero(ctx->tail.c, sizeof(ctx->tail)); + if (K_len > 64) { + sha256_init(&ctx->ctx); + sha256_update(&ctx->ctx, K, K_len); + sha256_final(ctx->tail.c, &ctx->ctx); + } else { + sha256_bcopy(ctx->tail.c, K, K_len); + } + + for (i = 0; i < 64/sizeof(limb_t); i++) + ctx->tail.l[i] ^= (limb_t)0x3636363636363636; + + sha256_init(&ctx->ctx); + sha256_update(&ctx->ctx, ctx->tail.c, 64); + sha256_hcopy(ctx->h_ipad, ctx->ctx.h); + + for (i = 0; i < 64/sizeof(limb_t); i++) + ctx->tail.l[i] ^= (limb_t)(0x3636363636363636 ^ 0x5c5c5c5c5c5c5c5c); + + sha256_init_h(ctx->h_opad); + sha256_block_data_order(ctx->h_opad, ctx->tail.c, 1); + + vec_zero(ctx->tail.c, sizeof(ctx->tail)); + ctx->tail.c[32] = 0x80; + ctx->tail.c[62] = 3; /* (64+32)*8 in big endian */ + ctx->tail.c[63] = 0; +} + +static void HMAC_update(HMAC_SHA256_CTX *ctx, const unsigned char *inp, + size_t len) +{ sha256_update(&ctx->ctx, inp, len); } + +static void HMAC_final(unsigned char md[32], HMAC_SHA256_CTX *ctx) +{ + sha256_final(ctx->tail.c, &ctx->ctx); + sha256_hcopy(ctx->ctx.h, ctx->h_opad); + sha256_block_data_order(ctx->ctx.h, ctx->tail.c, 1); + sha256_emit(md, ctx->ctx.h); +} + +static void HKDF_Extract(unsigned char PRK[32], + const void *salt, size_t salt_len, + const void *IKM, size_t IKM_len, +#ifndef __BLST_HKDF_TESTMODE__ + int IKM_fixup, +#endif + HMAC_SHA256_CTX *ctx) +{ + unsigned char zero[1] = { 0 }; + + HMAC_init(ctx, salt != NULL ? 
salt : zero, salt_len); + HMAC_update(ctx, IKM, IKM_len); +#ifndef __BLST_HKDF_TESTMODE__ + if (IKM_fixup) { + /* Section 2.3 KeyGen in BLS-signature draft */ + HMAC_update(ctx, zero, 1); + } +#endif + HMAC_final(PRK, ctx); +} + +static void HKDF_Expand(unsigned char *OKM, size_t L, + const unsigned char PRK[32], + const void *info, size_t info_len, +#ifndef __BLST_HKDF_TESTMODE__ + int info_fixup, +#endif + HMAC_SHA256_CTX *ctx) +{ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 \ + || defined(__STDC_NO_VLA__) + unsigned char *info_prime = alloca(info_len + 2 + 1); +#else + unsigned char info_prime[info_len + 2 + 1]; +#endif + + HMAC_init(ctx, PRK, 32); + + if (info_len != 0) + sha256_bcopy(info_prime, info, info_len); +#ifndef __BLST_HKDF_TESTMODE__ + if (info_fixup) { + /* Section 2.3 KeyGen in BLS-signature draft */ + info_prime[info_len + 0] = (unsigned char)(L >> 8); + info_prime[info_len + 1] = (unsigned char)(L); + info_len += 2; + } +#endif + info_prime[info_len] = 1; /* counter */ + HMAC_update(ctx, info_prime, info_len + 1); + HMAC_final(ctx->tail.c, ctx); + while (L > 32) { + sha256_hcopy((unsigned int *)OKM, (const unsigned int *)ctx->tail.c); + OKM += 32; L -= 32; + ++info_prime[info_len]; /* counter */ + HMAC_init(ctx, NULL, 0); + HMAC_update(ctx, ctx->tail.c, 32); + HMAC_update(ctx, info_prime, info_len + 1); + HMAC_final(ctx->tail.c, ctx); + } + sha256_bcopy(OKM, ctx->tail.c, L); +} + +#ifndef __BLST_HKDF_TESTMODE__ +static void keygen(pow256 SK, const void *IKM, size_t IKM_len, + const void *salt, size_t salt_len, + const void *info, size_t info_len, + int version) +{ + struct { + HMAC_SHA256_CTX ctx; + unsigned char PRK[32], OKM[48]; + vec512 key; + } scratch; + unsigned char salt_prime[32] = "BLS-SIG-KEYGEN-SALT-"; + + if (IKM_len < 32 || (version > 4 && salt == NULL)) { + vec_zero(SK, sizeof(pow256)); + return; + } + + /* + * Vet |info| since some callers were caught to be sloppy, e.g. + * SWIG-4.0-generated Python wrapper... + */ + info_len = info==NULL ? 0 : info_len; + + if (salt == NULL) { + salt = salt_prime; + salt_len = 20; + } + + if (version == 4) { + /* salt = H(salt) */ + sha256_init(&scratch.ctx.ctx); + sha256_update(&scratch.ctx.ctx, salt, salt_len); + sha256_final(salt_prime, &scratch.ctx.ctx); + salt = salt_prime; + salt_len = sizeof(salt_prime); + } + + while (1) { + /* PRK = HKDF-Extract(salt, IKM || I2OSP(0, 1)) */ + HKDF_Extract(scratch.PRK, salt, salt_len, + IKM, IKM_len, 1, &scratch.ctx); + + /* OKM = HKDF-Expand(PRK, key_info || I2OSP(L, 2), L) */ + HKDF_Expand(scratch.OKM, sizeof(scratch.OKM), scratch.PRK, + info, info_len, 1, &scratch.ctx); + + /* SK = OS2IP(OKM) mod r */ + vec_zero(scratch.key, sizeof(scratch.key)); + limbs_from_be_bytes(scratch.key, scratch.OKM, sizeof(scratch.OKM)); + redc_mont_256(scratch.key, scratch.key, BLS12_381_r, r0); + /* + * Given that mul_mont_sparse_256 has special boundary conditions + * it's appropriate to mention that redc_mont_256 output is fully + * reduced at this point. Because we started with 384-bit input, + * one with most significant half smaller than the modulus. 
+ */ + mul_mont_sparse_256(scratch.key, scratch.key, BLS12_381_rRR, + BLS12_381_r, r0); + + if (version < 4 || !vec_is_zero(scratch.key, sizeof(vec256))) + break; + + /* salt = H(salt) */ + sha256_init(&scratch.ctx.ctx); + sha256_update(&scratch.ctx.ctx, salt, salt_len); + sha256_final(salt_prime, &scratch.ctx.ctx); + salt = salt_prime; + salt_len = sizeof(salt_prime); + } + + le_bytes_from_limbs(SK, scratch.key, sizeof(pow256)); + + /* + * scrub the stack just in case next callee inadvertently flashes + * a fragment across application boundary... + */ + vec_zero(&scratch, sizeof(scratch)); +} + +void blst_keygen(pow256 SK, const void *IKM, size_t IKM_len, + const void *info, size_t info_len) +{ keygen(SK, IKM, IKM_len, NULL, 0, info, info_len, 4); } + +void blst_keygen_v3(pow256 SK, const void *IKM, size_t IKM_len, + const void *info, size_t info_len) +{ keygen(SK, IKM, IKM_len, NULL, 0, info, info_len, 3); } + +void blst_keygen_v4_5(pow256 SK, const void *IKM, size_t IKM_len, + const void *salt, size_t salt_len, + const void *info, size_t info_len) +{ keygen(SK, IKM, IKM_len, salt, salt_len, info, info_len, 4); } + +void blst_keygen_v5(pow256 SK, const void *IKM, size_t IKM_len, + const void *salt, size_t salt_len, + const void *info, size_t info_len) +{ keygen(SK, IKM, IKM_len, salt, salt_len, info, info_len, 5); } + +/* + * https://eips.ethereum.org/EIPS/eip-2333 + */ +void blst_derive_master_eip2333(pow256 SK, const void *seed, size_t seed_len) +{ keygen(SK, seed, seed_len, NULL, 0, NULL, 0, 4); } + +static void parent_SK_to_lamport_PK(pow256 PK, const pow256 parent_SK, + unsigned int index) +{ + size_t i; + struct { + HMAC_SHA256_CTX ctx; + SHA256_CTX ret; + unsigned char PRK[32], IKM[32]; + unsigned char lamport[255][32]; + } scratch; + + /* salt = I2OSP(index, 4) */ + unsigned char salt[4] = { (unsigned char)(index>>24), + (unsigned char)(index>>16), + (unsigned char)(index>>8), + (unsigned char)(index) }; + + /* IKM = I2OSP(parent_SK, 32) */ + for (i = 0; i < 32; i++) + scratch.IKM[i] = parent_SK[31-i]; + + /* lamport_0 = IKM_to_lamport_SK(IKM, salt) */ + HKDF_Extract(scratch.PRK, salt, sizeof(salt), scratch.IKM, 32, 0, + &scratch.ctx); + HKDF_Expand(scratch.lamport[0], sizeof(scratch.lamport), + scratch.PRK, NULL, 0, 0, &scratch.ctx); + + vec_zero(scratch.ctx.ctx.buf, sizeof(scratch.ctx.ctx.buf)); + scratch.ctx.ctx.buf[32] = 0x80; + scratch.ctx.ctx.buf[62] = 1; /* 32*8 in big endian */ + scratch.ctx.ctx.buf[63] = 0; + for (i = 0; i < 255; i++) { + /* lamport_PK = lamport_PK | SHA256(lamport_0[i]) */ + sha256_init_h(scratch.ctx.ctx.h); + sha256_bcopy(scratch.ctx.ctx.buf, scratch.lamport[i], 32); + sha256_block_data_order(scratch.ctx.ctx.h, scratch.ctx.ctx.buf, 1); + sha256_emit(scratch.lamport[i], scratch.ctx.ctx.h); + } + + /* compressed_lamport_PK = SHA256(lamport_PK) */ + sha256_init(&scratch.ret); + sha256_update(&scratch.ret, scratch.lamport, sizeof(scratch.lamport)); + + /* not_IKM = flip_bits(IKM) */ + for (i = 0; i< 32; i++) + scratch.IKM[i] = ~scratch.IKM[i]; + + /* lamport_1 = IKM_to_lamport_SK(not_IKM, salt) */ + HKDF_Extract(scratch.PRK, salt, sizeof(salt), scratch.IKM, 32, 0, + &scratch.ctx); + HKDF_Expand(scratch.lamport[0], sizeof(scratch.lamport), + scratch.PRK, NULL, 0, 0, &scratch.ctx); + + vec_zero(scratch.ctx.ctx.buf, sizeof(scratch.ctx.ctx.buf)); + scratch.ctx.ctx.buf[32] = 0x80; + scratch.ctx.ctx.buf[62] = 1; + for (i = 0; i < 255; i++) { + /* lamport_PK = lamport_PK | SHA256(lamport_1[i]) */ + sha256_init_h(scratch.ctx.ctx.h); + 
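+        /* the block buffer was pre-padded above (0x80 terminator plus a
+         * 256-bit message length), so each 32-byte lamport_1[i] is hashed
+         * with a single compression-function call; only the message bytes
+         * change per iteration. */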
sha256_bcopy(scratch.ctx.ctx.buf, scratch.lamport[i], 32); + sha256_block_data_order(scratch.ctx.ctx.h, scratch.ctx.ctx.buf, 1); + sha256_emit(scratch.lamport[i], scratch.ctx.ctx.h); + } + + /* compressed_lamport_PK = SHA256(lamport_PK) */ + sha256_update(&scratch.ret, scratch.lamport, sizeof(scratch.lamport)); + sha256_final(PK, &scratch.ret); + + /* + * scrub the stack just in case next callee inadvertently flashes + * a fragment across application boundary... + */ + vec_zero(&scratch, sizeof(scratch)); +} + +void blst_derive_child_eip2333(pow256 SK, const pow256 parent_SK, + unsigned int child_index) +{ + parent_SK_to_lamport_PK(SK, parent_SK, child_index); + keygen(SK, SK, sizeof(pow256), NULL, 0, NULL, 0, 4); +} +#endif diff --git a/crypto/blst_src/map_to_g1.c b/crypto/blst_src/map_to_g1.c new file mode 100644 index 00000000000..6613d68bb29 --- /dev/null +++ b/crypto/blst_src/map_to_g1.c @@ -0,0 +1,559 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" + +/* + * y^2 = x^3 + A'*x + B', isogenous one + */ +static const vec384 Aprime_E1 = { + /* (0x00144698a3b8e9433d693a02c96d4982b0ea985383ee66a8 + d8e8981aefd881ac98936f8da0e0f97f5cf428082d584c1d << 384) % P */ + TO_LIMB_T(0x2f65aa0e9af5aa51), TO_LIMB_T(0x86464c2d1e8416c3), + TO_LIMB_T(0xb85ce591b7bd31e2), TO_LIMB_T(0x27e11c91b5f24e7c), + TO_LIMB_T(0x28376eda6bfc1835), TO_LIMB_T(0x155455c3e5071d85) +}; +static const vec384 Bprime_E1 = { + /* (0x12e2908d11688030018b12e8753eee3b2016c1f0f24f4070 + a0b9c14fcef35ef55a23215a316ceaa5d1cc48e98e172be0 << 384) % P */ + TO_LIMB_T(0xfb996971fe22a1e0), TO_LIMB_T(0x9aa93eb35b742d6f), + TO_LIMB_T(0x8c476013de99c5c4), TO_LIMB_T(0x873e27c3a221e571), + TO_LIMB_T(0xca72b5e45a52d888), TO_LIMB_T(0x06824061418a386b) +}; + +static void map_fp_times_Zz(vec384 map[], const vec384 isogeny_map[], + const vec384 Zz_powers[], size_t n) +{ + while (n--) + mul_fp(map[n], isogeny_map[n], Zz_powers[n]); +} + +static void map_fp(vec384 acc, const vec384 x, const vec384 map[], size_t n) +{ + while (n--) { + mul_fp(acc, acc, x); + add_fp(acc, acc, map[n]); + } +} + +static void isogeny_map_to_E1(POINTonE1 *out, const POINTonE1 *p) +{ + /* + * x = x_num / x_den, where + * x_num = k_(1,11) * x'^11 + k_(1,10) * x'^10 + k_(1,9) * x'^9 + + * ... + k_(1,0) + * ... 
+ */ + static const vec384 isogeny_map_x_num[] = { /* (k_(1,*)<<384) % P */ + { TO_LIMB_T(0x4d18b6f3af00131c), TO_LIMB_T(0x19fa219793fee28c), + TO_LIMB_T(0x3f2885f1467f19ae), TO_LIMB_T(0x23dcea34f2ffb304), + TO_LIMB_T(0xd15b58d2ffc00054), TO_LIMB_T(0x0913be200a20bef4) }, + { TO_LIMB_T(0x898985385cdbbd8b), TO_LIMB_T(0x3c79e43cc7d966aa), + TO_LIMB_T(0x1597e193f4cd233a), TO_LIMB_T(0x8637ef1e4d6623ad), + TO_LIMB_T(0x11b22deed20d827b), TO_LIMB_T(0x07097bc5998784ad) }, + { TO_LIMB_T(0xa542583a480b664b), TO_LIMB_T(0xfc7169c026e568c6), + TO_LIMB_T(0x5ba2ef314ed8b5a6), TO_LIMB_T(0x5b5491c05102f0e7), + TO_LIMB_T(0xdf6e99707d2a0079), TO_LIMB_T(0x0784151ed7605524) }, + { TO_LIMB_T(0x494e212870f72741), TO_LIMB_T(0xab9be52fbda43021), + TO_LIMB_T(0x26f5577994e34c3d), TO_LIMB_T(0x049dfee82aefbd60), + TO_LIMB_T(0x65dadd7828505289), TO_LIMB_T(0x0e93d431ea011aeb) }, + { TO_LIMB_T(0x90ee774bd6a74d45), TO_LIMB_T(0x7ada1c8a41bfb185), + TO_LIMB_T(0x0f1a8953b325f464), TO_LIMB_T(0x104c24211be4805c), + TO_LIMB_T(0x169139d319ea7a8f), TO_LIMB_T(0x09f20ead8e532bf6) }, + { TO_LIMB_T(0x6ddd93e2f43626b7), TO_LIMB_T(0xa5482c9aa1ccd7bd), + TO_LIMB_T(0x143245631883f4bd), TO_LIMB_T(0x2e0a94ccf77ec0db), + TO_LIMB_T(0xb0282d480e56489f), TO_LIMB_T(0x18f4bfcbb4368929) }, + { TO_LIMB_T(0x23c5f0c953402dfd), TO_LIMB_T(0x7a43ff6958ce4fe9), + TO_LIMB_T(0x2c390d3d2da5df63), TO_LIMB_T(0xd0df5c98e1f9d70f), + TO_LIMB_T(0xffd89869a572b297), TO_LIMB_T(0x1277ffc72f25e8fe) }, + { TO_LIMB_T(0x79f4f0490f06a8a6), TO_LIMB_T(0x85f894a88030fd81), + TO_LIMB_T(0x12da3054b18b6410), TO_LIMB_T(0xe2a57f6505880d65), + TO_LIMB_T(0xbba074f260e400f1), TO_LIMB_T(0x08b76279f621d028) }, + { TO_LIMB_T(0xe67245ba78d5b00b), TO_LIMB_T(0x8456ba9a1f186475), + TO_LIMB_T(0x7888bff6e6b33bb4), TO_LIMB_T(0xe21585b9a30f86cb), + TO_LIMB_T(0x05a69cdcef55feee), TO_LIMB_T(0x09e699dd9adfa5ac) }, + { TO_LIMB_T(0x0de5c357bff57107), TO_LIMB_T(0x0a0db4ae6b1a10b2), + TO_LIMB_T(0xe256bb67b3b3cd8d), TO_LIMB_T(0x8ad456574e9db24f), + TO_LIMB_T(0x0443915f50fd4179), TO_LIMB_T(0x098c4bf7de8b6375) }, + { TO_LIMB_T(0xe6b0617e7dd929c7), TO_LIMB_T(0xfe6e37d442537375), + TO_LIMB_T(0x1dafdeda137a489e), TO_LIMB_T(0xe4efd1ad3f767ceb), + TO_LIMB_T(0x4a51d8667f0fe1cf), TO_LIMB_T(0x054fdf4bbf1d821c) }, + { TO_LIMB_T(0x72db2a50658d767b), TO_LIMB_T(0x8abf91faa257b3d5), + TO_LIMB_T(0xe969d6833764ab47), TO_LIMB_T(0x464170142a1009eb), + TO_LIMB_T(0xb14f01aadb30be2f), TO_LIMB_T(0x18ae6a856f40715d) } + }; + /* ... + * x_den = x'^10 + k_(2,9) * x'^9 + k_(2,8) * x'^8 + ... 
+ k_(2,0) + */ + static const vec384 isogeny_map_x_den[] = { /* (k_(2,*)<<384) % P */ + { TO_LIMB_T(0xb962a077fdb0f945), TO_LIMB_T(0xa6a9740fefda13a0), + TO_LIMB_T(0xc14d568c3ed6c544), TO_LIMB_T(0xb43fc37b908b133e), + TO_LIMB_T(0x9c0b3ac929599016), TO_LIMB_T(0x0165aa6c93ad115f) }, + { TO_LIMB_T(0x23279a3ba506c1d9), TO_LIMB_T(0x92cfca0a9465176a), + TO_LIMB_T(0x3b294ab13755f0ff), TO_LIMB_T(0x116dda1c5070ae93), + TO_LIMB_T(0xed4530924cec2045), TO_LIMB_T(0x083383d6ed81f1ce) }, + { TO_LIMB_T(0x9885c2a6449fecfc), TO_LIMB_T(0x4a2b54ccd37733f0), + TO_LIMB_T(0x17da9ffd8738c142), TO_LIMB_T(0xa0fba72732b3fafd), + TO_LIMB_T(0xff364f36e54b6812), TO_LIMB_T(0x0f29c13c660523e2) }, + { TO_LIMB_T(0xe349cc118278f041), TO_LIMB_T(0xd487228f2f3204fb), + TO_LIMB_T(0xc9d325849ade5150), TO_LIMB_T(0x43a92bd69c15c2df), + TO_LIMB_T(0x1c2c7844bc417be4), TO_LIMB_T(0x12025184f407440c) }, + { TO_LIMB_T(0x587f65ae6acb057b), TO_LIMB_T(0x1444ef325140201f), + TO_LIMB_T(0xfbf995e71270da49), TO_LIMB_T(0xccda066072436a42), + TO_LIMB_T(0x7408904f0f186bb2), TO_LIMB_T(0x13b93c63edf6c015) }, + { TO_LIMB_T(0xfb918622cd141920), TO_LIMB_T(0x4a4c64423ecaddb4), + TO_LIMB_T(0x0beb232927f7fb26), TO_LIMB_T(0x30f94df6f83a3dc2), + TO_LIMB_T(0xaeedd424d780f388), TO_LIMB_T(0x06cc402dd594bbeb) }, + { TO_LIMB_T(0xd41f761151b23f8f), TO_LIMB_T(0x32a92465435719b3), + TO_LIMB_T(0x64f436e888c62cb9), TO_LIMB_T(0xdf70a9a1f757c6e4), + TO_LIMB_T(0x6933a38d5b594c81), TO_LIMB_T(0x0c6f7f7237b46606) }, + { TO_LIMB_T(0x693c08747876c8f7), TO_LIMB_T(0x22c9850bf9cf80f0), + TO_LIMB_T(0x8e9071dab950c124), TO_LIMB_T(0x89bc62d61c7baf23), + TO_LIMB_T(0xbc6be2d8dad57c23), TO_LIMB_T(0x17916987aa14a122) }, + { TO_LIMB_T(0x1be3ff439c1316fd), TO_LIMB_T(0x9965243a7571dfa7), + TO_LIMB_T(0xc7f7f62962f5cd81), TO_LIMB_T(0x32c6aa9af394361c), + TO_LIMB_T(0xbbc2ee18e1c227f4), TO_LIMB_T(0x0c102cbac531bb34) }, + { TO_LIMB_T(0x997614c97bacbf07), TO_LIMB_T(0x61f86372b99192c0), + TO_LIMB_T(0x5b8c95fc14353fc3), TO_LIMB_T(0xca2b066c2a87492f), + TO_LIMB_T(0x16178f5bbf698711), TO_LIMB_T(0x12a6dcd7f0f4e0e8) } + }; + /* + * y = y' * y_num / y_den, where + * y_num = k_(3,15) * x'^15 + k_(3,14) * x'^14 + k_(3,13) * x'^13 + + * ... + k_(3,0) + * ... 
+ */ + static const vec384 isogeny_map_y_num[] = { /* (k_(3,*)<<384) % P */ + { TO_LIMB_T(0x2b567ff3e2837267), TO_LIMB_T(0x1d4d9e57b958a767), + TO_LIMB_T(0xce028fea04bd7373), TO_LIMB_T(0xcc31a30a0b6cd3df), + TO_LIMB_T(0x7d7b18a682692693), TO_LIMB_T(0x0d300744d42a0310) }, + { TO_LIMB_T(0x99c2555fa542493f), TO_LIMB_T(0xfe7f53cc4874f878), + TO_LIMB_T(0x5df0608b8f97608a), TO_LIMB_T(0x14e03832052b49c8), + TO_LIMB_T(0x706326a6957dd5a4), TO_LIMB_T(0x0a8dadd9c2414555) }, + { TO_LIMB_T(0x13d942922a5cf63a), TO_LIMB_T(0x357e33e36e261e7d), + TO_LIMB_T(0xcf05a27c8456088d), TO_LIMB_T(0x0000bd1de7ba50f0), + TO_LIMB_T(0x83d0c7532f8c1fde), TO_LIMB_T(0x13f70bf38bbf2905) }, + { TO_LIMB_T(0x5c57fd95bfafbdbb), TO_LIMB_T(0x28a359a65e541707), + TO_LIMB_T(0x3983ceb4f6360b6d), TO_LIMB_T(0xafe19ff6f97e6d53), + TO_LIMB_T(0xb3468f4550192bf7), TO_LIMB_T(0x0bb6cde49d8ba257) }, + { TO_LIMB_T(0x590b62c7ff8a513f), TO_LIMB_T(0x314b4ce372cacefd), + TO_LIMB_T(0x6bef32ce94b8a800), TO_LIMB_T(0x6ddf84a095713d5f), + TO_LIMB_T(0x64eace4cb0982191), TO_LIMB_T(0x0386213c651b888d) }, + { TO_LIMB_T(0xa5310a31111bbcdd), TO_LIMB_T(0xa14ac0f5da148982), + TO_LIMB_T(0xf9ad9cc95423d2e9), TO_LIMB_T(0xaa6ec095283ee4a7), + TO_LIMB_T(0xcf5b1f022e1c9107), TO_LIMB_T(0x01fddf5aed881793) }, + { TO_LIMB_T(0x65a572b0d7a7d950), TO_LIMB_T(0xe25c2d8183473a19), + TO_LIMB_T(0xc2fcebe7cb877dbd), TO_LIMB_T(0x05b2d36c769a89b0), + TO_LIMB_T(0xba12961be86e9efb), TO_LIMB_T(0x07eb1b29c1dfde1f) }, + { TO_LIMB_T(0x93e09572f7c4cd24), TO_LIMB_T(0x364e929076795091), + TO_LIMB_T(0x8569467e68af51b5), TO_LIMB_T(0xa47da89439f5340f), + TO_LIMB_T(0xf4fa918082e44d64), TO_LIMB_T(0x0ad52ba3e6695a79) }, + { TO_LIMB_T(0x911429844e0d5f54), TO_LIMB_T(0xd03f51a3516bb233), + TO_LIMB_T(0x3d587e5640536e66), TO_LIMB_T(0xfa86d2a3a9a73482), + TO_LIMB_T(0xa90ed5adf1ed5537), TO_LIMB_T(0x149c9c326a5e7393) }, + { TO_LIMB_T(0x462bbeb03c12921a), TO_LIMB_T(0xdc9af5fa0a274a17), + TO_LIMB_T(0x9a558ebde836ebed), TO_LIMB_T(0x649ef8f11a4fae46), + TO_LIMB_T(0x8100e1652b3cdc62), TO_LIMB_T(0x1862bd62c291dacb) }, + { TO_LIMB_T(0x05c9b8ca89f12c26), TO_LIMB_T(0x0194160fa9b9ac4f), + TO_LIMB_T(0x6a643d5a6879fa2c), TO_LIMB_T(0x14665bdd8846e19d), + TO_LIMB_T(0xbb1d0d53af3ff6bf), TO_LIMB_T(0x12c7e1c3b28962e5) }, + { TO_LIMB_T(0xb55ebf900b8a3e17), TO_LIMB_T(0xfedc77ec1a9201c4), + TO_LIMB_T(0x1f07db10ea1a4df4), TO_LIMB_T(0x0dfbd15dc41a594d), + TO_LIMB_T(0x389547f2334a5391), TO_LIMB_T(0x02419f98165871a4) }, + { TO_LIMB_T(0xb416af000745fc20), TO_LIMB_T(0x8e563e9d1ea6d0f5), + TO_LIMB_T(0x7c763e17763a0652), TO_LIMB_T(0x01458ef0159ebbef), + TO_LIMB_T(0x8346fe421f96bb13), TO_LIMB_T(0x0d2d7b829ce324d2) }, + { TO_LIMB_T(0x93096bb538d64615), TO_LIMB_T(0x6f2a2619951d823a), + TO_LIMB_T(0x8f66b3ea59514fa4), TO_LIMB_T(0xf563e63704f7092f), + TO_LIMB_T(0x724b136c4cf2d9fa), TO_LIMB_T(0x046959cfcfd0bf49) }, + { TO_LIMB_T(0xea748d4b6e405346), TO_LIMB_T(0x91e9079c2c02d58f), + TO_LIMB_T(0x41064965946d9b59), TO_LIMB_T(0xa06731f1d2bbe1ee), + TO_LIMB_T(0x07f897e267a33f1b), TO_LIMB_T(0x1017290919210e5f) }, + { TO_LIMB_T(0x872aa6c17d985097), TO_LIMB_T(0xeecc53161264562a), + TO_LIMB_T(0x07afe37afff55002), TO_LIMB_T(0x54759078e5be6838), + TO_LIMB_T(0xc4b92d15db8acca8), TO_LIMB_T(0x106d87d1b51d13b9) } + }; + /* ... + * y_den = x'^15 + k_(4,14) * x'^14 + k_(4,13) * x'^13 + ... 
+ k_(4,0) + */ + static const vec384 isogeny_map_y_den[] = { /* (k_(4,*)<<384) % P */ + { TO_LIMB_T(0xeb6c359d47e52b1c), TO_LIMB_T(0x18ef5f8a10634d60), + TO_LIMB_T(0xddfa71a0889d5b7e), TO_LIMB_T(0x723e71dcc5fc1323), + TO_LIMB_T(0x52f45700b70d5c69), TO_LIMB_T(0x0a8b981ee47691f1) }, + { TO_LIMB_T(0x616a3c4f5535b9fb), TO_LIMB_T(0x6f5f037395dbd911), + TO_LIMB_T(0xf25f4cc5e35c65da), TO_LIMB_T(0x3e50dffea3c62658), + TO_LIMB_T(0x6a33dca523560776), TO_LIMB_T(0x0fadeff77b6bfe3e) }, + { TO_LIMB_T(0x2be9b66df470059c), TO_LIMB_T(0x24a2c159a3d36742), + TO_LIMB_T(0x115dbe7ad10c2a37), TO_LIMB_T(0xb6634a652ee5884d), + TO_LIMB_T(0x04fe8bb2b8d81af4), TO_LIMB_T(0x01c2a7a256fe9c41) }, + { TO_LIMB_T(0xf27bf8ef3b75a386), TO_LIMB_T(0x898b367476c9073f), + TO_LIMB_T(0x24482e6b8c2f4e5f), TO_LIMB_T(0xc8e0bbd6fe110806), + TO_LIMB_T(0x59b0c17f7631448a), TO_LIMB_T(0x11037cd58b3dbfbd) }, + { TO_LIMB_T(0x31c7912ea267eec6), TO_LIMB_T(0x1dbf6f1c5fcdb700), + TO_LIMB_T(0xd30d4fe3ba86fdb1), TO_LIMB_T(0x3cae528fbee9a2a4), + TO_LIMB_T(0xb1cce69b6aa9ad9a), TO_LIMB_T(0x044393bb632d94fb) }, + { TO_LIMB_T(0xc66ef6efeeb5c7e8), TO_LIMB_T(0x9824c289dd72bb55), + TO_LIMB_T(0x71b1a4d2f119981d), TO_LIMB_T(0x104fc1aafb0919cc), + TO_LIMB_T(0x0e49df01d942a628), TO_LIMB_T(0x096c3a09773272d4) }, + { TO_LIMB_T(0x9abc11eb5fadeff4), TO_LIMB_T(0x32dca50a885728f0), + TO_LIMB_T(0xfb1fa3721569734c), TO_LIMB_T(0xc4b76271ea6506b3), + TO_LIMB_T(0xd466a75599ce728e), TO_LIMB_T(0x0c81d4645f4cb6ed) }, + { TO_LIMB_T(0x4199f10e5b8be45b), TO_LIMB_T(0xda64e495b1e87930), + TO_LIMB_T(0xcb353efe9b33e4ff), TO_LIMB_T(0x9e9efb24aa6424c6), + TO_LIMB_T(0xf08d33680a237465), TO_LIMB_T(0x0d3378023e4c7406) }, + { TO_LIMB_T(0x7eb4ae92ec74d3a5), TO_LIMB_T(0xc341b4aa9fac3497), + TO_LIMB_T(0x5be603899e907687), TO_LIMB_T(0x03bfd9cca75cbdeb), + TO_LIMB_T(0x564c2935a96bfa93), TO_LIMB_T(0x0ef3c33371e2fdb5) }, + { TO_LIMB_T(0x7ee91fd449f6ac2e), TO_LIMB_T(0xe5d5bd5cb9357a30), + TO_LIMB_T(0x773a8ca5196b1380), TO_LIMB_T(0xd0fda172174ed023), + TO_LIMB_T(0x6cb95e0fa776aead), TO_LIMB_T(0x0d22d5a40cec7cff) }, + { TO_LIMB_T(0xf727e09285fd8519), TO_LIMB_T(0xdc9d55a83017897b), + TO_LIMB_T(0x7549d8bd057894ae), TO_LIMB_T(0x178419613d90d8f8), + TO_LIMB_T(0xfce95ebdeb5b490a), TO_LIMB_T(0x0467ffaef23fc49e) }, + { TO_LIMB_T(0xc1769e6a7c385f1b), TO_LIMB_T(0x79bc930deac01c03), + TO_LIMB_T(0x5461c75a23ede3b5), TO_LIMB_T(0x6e20829e5c230c45), + TO_LIMB_T(0x828e0f1e772a53cd), TO_LIMB_T(0x116aefa749127bff) }, + { TO_LIMB_T(0x101c10bf2744c10a), TO_LIMB_T(0xbbf18d053a6a3154), + TO_LIMB_T(0xa0ecf39ef026f602), TO_LIMB_T(0xfc009d4996dc5153), + TO_LIMB_T(0xb9000209d5bd08d3), TO_LIMB_T(0x189e5fe4470cd73c) }, + { TO_LIMB_T(0x7ebd546ca1575ed2), TO_LIMB_T(0xe47d5a981d081b55), + TO_LIMB_T(0x57b2b625b6d4ca21), TO_LIMB_T(0xb0a1ba04228520cc), + TO_LIMB_T(0x98738983c2107ff3), TO_LIMB_T(0x13dddbc4799d81d6) }, + { TO_LIMB_T(0x09319f2e39834935), TO_LIMB_T(0x039e952cbdb05c21), + TO_LIMB_T(0x55ba77a9a2f76493), TO_LIMB_T(0xfd04e3dfc6086467), + TO_LIMB_T(0xfb95832e7d78742e), TO_LIMB_T(0x0ef9c24eccaf5e0e) } + }; + vec384 Zz_powers[15], map[15], xn, xd, yn, yd; + + /* lay down Z^2 powers in descending order */ + sqr_fp(Zz_powers[14], p->Z); /* ZZ^1 */ +#ifdef __OPTIMIZE_SIZE__ + for (size_t i = 14; i > 0; i--) + mul_fp(Zz_powers[i-1], Zz_powers[i], Zz_powers[14]); +#else + sqr_fp(Zz_powers[13], Zz_powers[14]); /* ZZ^2 1+1 */ + mul_fp(Zz_powers[12], Zz_powers[14], Zz_powers[13]);/* ZZ^3 2+1 */ + sqr_fp(Zz_powers[11], Zz_powers[13]); /* ZZ^4 2+2 */ + mul_fp(Zz_powers[10], Zz_powers[13], Zz_powers[12]);/* ZZ^5 2+3 */ + 
sqr_fp(Zz_powers[9], Zz_powers[12]); /* ZZ^6 3+3 */ + mul_fp(Zz_powers[8], Zz_powers[12], Zz_powers[11]);/* ZZ^7 3+4 */ + sqr_fp(Zz_powers[7], Zz_powers[11]); /* ZZ^8 4+4 */ + mul_fp(Zz_powers[6], Zz_powers[11], Zz_powers[10]);/* ZZ^9 4+5 */ + sqr_fp(Zz_powers[5], Zz_powers[10]); /* ZZ^10 5+5 */ + mul_fp(Zz_powers[4], Zz_powers[10], Zz_powers[9]); /* ZZ^11 5+6 */ + sqr_fp(Zz_powers[3], Zz_powers[9]); /* ZZ^12 6+6 */ + mul_fp(Zz_powers[2], Zz_powers[9], Zz_powers[8]); /* ZZ^13 6+7 */ + sqr_fp(Zz_powers[1], Zz_powers[8]); /* ZZ^14 7+7 */ + mul_fp(Zz_powers[0], Zz_powers[8], Zz_powers[7]); /* ZZ^15 7+8 */ +#endif + + map_fp_times_Zz(map, isogeny_map_x_num, Zz_powers + 4, 11); + mul_fp(xn, p->X, isogeny_map_x_num[11]); + add_fp(xn, xn, map[10]); + map_fp(xn, p->X, map, 10); + + map_fp_times_Zz(map, isogeny_map_x_den, Zz_powers + 5, 10); + add_fp(xd, p->X, map[9]); + map_fp(xd, p->X, map, 9); + mul_fp(xd, xd, Zz_powers[14]); /* xd *= Z^2 */ + + map_fp_times_Zz(map, isogeny_map_y_num, Zz_powers, 15); + mul_fp(yn, p->X, isogeny_map_y_num[15]); + add_fp(yn, yn, map[14]); + map_fp(yn, p->X, map, 14); + mul_fp(yn, yn, p->Y); /* yn *= Y */ + + map_fp_times_Zz(map, isogeny_map_y_den, Zz_powers, 15); + add_fp(yd, p->X, map[14]); + map_fp(yd, p->X, map, 14); + mul_fp(Zz_powers[14], Zz_powers[14], p->Z); + mul_fp(yd, yd, Zz_powers[14]); /* yd *= Z^3 */ + + /* convert (xn, xd, yn, yd) to Jacobian coordinates */ + mul_fp(out->Z, xd, yd); /* Z = xd * yd */ + mul_fp(out->X, xn, yd); + mul_fp(out->X, out->X, out->Z); /* X = xn * xd * yd^2 */ + sqr_fp(out->Y, out->Z); + mul_fp(out->Y, out->Y, xd); + mul_fp(out->Y, out->Y, yn); /* Y = yn * xd^3 * yd^2 */ +} + +static void map_to_isogenous_E1(POINTonE1 *p, const vec384 u) +{ + static const vec384 minus_A = { /* P - A */ + TO_LIMB_T(0x8a9955f1650a005a), TO_LIMB_T(0x9865b3d192cfe93c), + TO_LIMB_T(0xaed3ed0f3ef3c441), TO_LIMB_T(0x3c962ef33d92c442), + TO_LIMB_T(0x22e438dbd74f94a2), TO_LIMB_T(0x04acbc265478c915) + }; + static const vec384 Z = { /* (11<<384) % P */ + TO_LIMB_T(0x886c00000023ffdc), TO_LIMB_T(0x0f70008d3090001d), + TO_LIMB_T(0x77672417ed5828c3), TO_LIMB_T(0x9dac23e943dc1740), + TO_LIMB_T(0x50553f1b9c131521), TO_LIMB_T(0x078c712fbe0ab6e8) + }; + static const vec384 sqrt_minus_ZZZ = { + TO_LIMB_T(0x43b571cad3215f1f), TO_LIMB_T(0xccb460ef1c702dc2), + TO_LIMB_T(0x742d884f4f97100b), TO_LIMB_T(0xdb2c3e3238a3382b), + TO_LIMB_T(0xe40f3fa13fce8f88), TO_LIMB_T(0x0073a2af9892a2ff) + }; + static const vec384 ZxA = { + TO_LIMB_T(0x7f674ea0a8915178), TO_LIMB_T(0xb0f945fc13b8fa65), + TO_LIMB_T(0x4b46759a38e87d76), TO_LIMB_T(0x2e7a929641bbb6a1), + TO_LIMB_T(0x1668ddfa462bf6b6), TO_LIMB_T(0x00960e2ed1cf294c) + }; + vec384 uu, tv2, x2n, gx1, gxd, y2; +#if 0 + vec384 xn, x1n, xd, y, y1, Zuu, tv4; +#else +# define xn p->X +# define y p->Y +# define xd p->Z +# define x1n xn +# define y1 y +# define Zuu x2n +# define tv4 y1 +#endif +#define sgn0_fp(a) (sgn0_pty_mont_384((a), BLS12_381_P, p0) & 1) + bool_t e1, e2; + + /* + * as per map_to_curve() from poc/sswu_opt.sage at + * https://github.com/cfrg/draft-irtf-cfrg-hash-to-curve + */ + /* x numerator variants */ + sqr_fp(uu, u); /* uu = u^2 */ + mul_fp(Zuu, Z, uu); /* Zuu = Z * uu */ + sqr_fp(tv2, Zuu); /* tv2 = Zuu^2 */ + add_fp(tv2, tv2, Zuu); /* tv2 = tv2 + Zuu */ + add_fp(x1n, tv2, BLS12_381_Rx.p); /* x1n = tv2 + 1 */ + mul_fp(x1n, x1n, Bprime_E1); /* x1n = x1n * B */ + mul_fp(x2n, Zuu, x1n); /* x2n = Zuu * x1n */ + + /* x denumenator */ + mul_fp(xd, minus_A, tv2); /* xd = -A * tv2 */ + e1 = vec_is_zero(xd, sizeof(xd)); 
/* e1 = xd == 0 */ + vec_select(xd, ZxA, xd, sizeof(xd), e1); /* # If xd == 0, set xd = Z*A */ + + /* y numerators variants */ + sqr_fp(tv2, xd); /* tv2 = xd^2 */ + mul_fp(gxd, xd, tv2); /* gxd = xd^3 */ + mul_fp(tv2, Aprime_E1, tv2); /* tv2 = A * tv2 */ + sqr_fp(gx1, x1n); /* gx1 = x1n^2 */ + add_fp(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1n^2 + A*xd^2 */ + mul_fp(gx1, gx1, x1n); /* gx1 = gx1 * x1n # x1n^3 + A*x1n*xd^2 */ + mul_fp(tv2, Bprime_E1, gxd); /* tv2 = B * gxd */ + add_fp(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1^3 + A*x1*xd^2 + B*xd^3 */ + sqr_fp(tv4, gxd); /* tv4 = gxd^2 */ + mul_fp(tv2, gx1, gxd); /* tv2 = gx1 * gxd */ + mul_fp(tv4, tv4, tv2); /* tv4 = tv4 * tv2 # gx1*gxd^3 */ + e2 = recip_sqrt_fp(y1, tv4); /* y1 = tv4^c1 # (gx1*gxd^3)^((p-3)/4) */ + mul_fp(y1, y1, tv2); /* y1 = y1 * tv2 # gx1*gxd*y1 */ + mul_fp(y2, y1, sqrt_minus_ZZZ); /* y2 = y1 * c2 # y2 = y1*sqrt(-Z^3) */ + mul_fp(y2, y2, uu); /* y2 = y2 * uu */ + mul_fp(y2, y2, u); /* y2 = y2 * u */ + + /* choose numerators */ + vec_select(xn, x1n, x2n, sizeof(xn), e2); /* xn = e2 ? x1n : x2n */ + vec_select(y, y1, y2, sizeof(y), e2); /* y = e2 ? y1 : y2 */ + + e1 = sgn0_fp(u); + e2 = sgn0_fp(y); + cneg_fp(y, y, e1^e2); /* fix sign of y */ + /* return (xn, xd, y, 1) */ + + /* convert (xn, xd, y, 1) to Jacobian projective coordinates */ + mul_fp(p->X, xn, xd); /* X = xn * xd */ + mul_fp(p->Y, y, gxd); /* Y = y * xd^3 */ +#ifndef xd + vec_copy(p->Z, xd, sizeof(xd)); /* Z = xd */ +#else +# undef xn +# undef y +# undef xd +# undef x1n +# undef y1 +# undef Zuu +# undef tv4 +#endif +#undef sgn0_fp +} + +static void POINTonE1_add_n_dbl(POINTonE1 *out, const POINTonE1 *p, size_t n) +{ + POINTonE1_dadd(out, out, p, NULL); + while(n--) + POINTonE1_double(out, out); +} + +static void POINTonE1_times_minus_z(POINTonE1 *out, const POINTonE1 *in) +{ + POINTonE1_double(out, in); /* 1: 0x2 */ + POINTonE1_add_n_dbl(out, in, 2); /* 2..4: 0x3..0xc */ + POINTonE1_add_n_dbl(out, in, 3); /* 5..8: 0xd..0x68 */ + POINTonE1_add_n_dbl(out, in, 9); /* 9..18: 0x69..0xd200 */ + POINTonE1_add_n_dbl(out, in, 32); /* 19..51: ..0xd20100000000 */ + POINTonE1_add_n_dbl(out, in, 16); /* 52..68: ..0xd201000000010000 */ +} + +/* + * |u|, |v| are expected to be in Montgomery representation + */ +static void map_to_g1(POINTonE1 *out, const vec384 u, const vec384 v) +{ + POINTonE1 p; + + map_to_isogenous_E1(&p, u); + + if (v != NULL) { + map_to_isogenous_E1(out, v); /* borrow |out| */ + POINTonE1_dadd(&p, &p, out, Aprime_E1); + } + + isogeny_map_to_E1(&p, &p); /* sprinkle isogenous powder */ + + /* clear the cofactor by multiplying |p| by 1-z, 0xd201000000010001 */ + POINTonE1_times_minus_z(out, &p); + POINTonE1_dadd(out, out, &p, NULL); +} + +void blst_map_to_g1(POINTonE1 *out, const vec384 u, const vec384 v) +{ map_to_g1(out, u, v); } + +static void Encode_to_G1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384 u[1]; + + hash_to_field(u, 1, aug, aug_len, msg, msg_len, DST, DST_len); + map_to_g1(p, u[0], NULL); +} + +void blst_encode_to_g1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ Encode_to_G1(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static void Hash_to_G1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384 u[2]; + + hash_to_field(u, 2, aug, 
aug_len, msg, msg_len, DST, DST_len); + map_to_g1(p, u[0], u[1]); +} + +void blst_hash_to_g1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ Hash_to_G1(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static void sigma(POINTonE1 *out, const POINTonE1 *in); + +#if 0 +#ifdef __OPTIMIZE_SIZE__ +static void POINTonE1_times_zz_minus_1_div_by_3(POINTonE1 *out, + const POINTonE1 *in) +{ + static const byte zz_minus_1_div_by_3[] = { + TO_BYTES(0x0000000055555555ULL), TO_BYTES(0x396c8c005555e156) + }; + size_t n = 126-1; + const POINTonE1 *dblin = in; + + while(n--) { + POINTonE1_double(out, dblin); dblin = out; + if (is_bit_set(zz_minus_1_div_by_3, n)) + POINTonE1_dadd(out, out, in, NULL); + } +} +#else +static void POINTonE1_dbl_n_add(POINTonE1 *out, size_t n, const POINTonE1 *p) +{ + while(n--) + POINTonE1_double(out, out); + POINTonE1_dadd(out, out, p, NULL); +} + +static void POINTonE1_times_zz_minus_1_div_by_3(POINTonE1 *out, + const POINTonE1 *in) +{ + POINTonE1 t3, t5, t7, t11, t85; + + POINTonE1_double(&t7, in); /* 2P */ + POINTonE1_dadd(&t3, &t7, in, NULL); /* 3P */ + POINTonE1_dadd(&t5, &t3, &t7, NULL); /* 5P */ + POINTonE1_dadd(&t7, &t5, &t7, NULL); /* 7P */ + POINTonE1_double(&t85, &t5); /* 10P */ + POINTonE1_dadd(&t11, &t85, in, NULL); /* 11P */ + POINTonE1_dbl_n_add(&t85, 3, &t5); /* 0x55P */ + /* (-0xd201000000010000^2 - 1) / 3 */ + POINTonE1_double(out, &t7); /* 0xe */ + POINTonE1_dbl_n_add(out, 5, &t11); /* 0x1cb */ + POINTonE1_dbl_n_add(out, 3, &t3); /* 0xe5b */ + POINTonE1_dbl_n_add(out, 3, in); /* 0x72d9 */ + POINTonE1_dbl_n_add(out, 5, &t3); /* 0xe5b23 */ + POINTonE1_dbl_n_add(out, 18, &t85); /* 0x396c8c0055 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555 */ + POINTonE1_dbl_n_add(out, 3, &t7); /* 0x1cb646002aaaf */ + POINTonE1_dbl_n_add(out, 7, &t5); /* 0xe5b23001555785 */ + POINTonE1_dbl_n_add(out, 5, &t11); /* 0x1cb646002aaaf0ab */ + POINTonE1_dbl_n_add(out, 41, &t85); /* 0x396c8c005555e1560000000055 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e156000000005555 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e15600000000555555 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e1560000000055555555 */ +} +#endif + +static bool_t POINTonE1_in_G1(const POINTonE1 *P) +{ + POINTonE1 t0, t1, t2; + + /* Bowe, S., "Faster subgroup checks for BLS12-381" */ + sigma(&t0, P); /* σ(P) */ + sigma(&t1, &t0); /* σ²(P) */ + + POINTonE1_double(&t0, &t0); /* 2σ(P) */ + POINTonE1_dadd(&t2, &t1, P, NULL); /* P + σ²(P) */ + POINTonE1_cneg(&t2, 1); /* - P - σ²(P) */ + POINTonE1_dadd(&t2, &t2, &t0, NULL); /* 2σ(P) - P - σ²(P) */ + POINTonE1_times_zz_minus_1_div_by_3( &t0, &t2); + POINTonE1_cneg(&t1, 1); + POINTonE1_dadd(&t0, &t0, &t1, NULL); /* [(z²-1)/3](2σ(P) - P - σ²(P)) */ + /* - σ²(P) */ + return vec_is_zero(t0.Z, sizeof(t0.Z)); +} +#else +static bool_t POINTonE1_in_G1(const POINTonE1 *P) +{ + POINTonE1 t0, t1; + + /* Scott, M., https://eprint.iacr.org/2021/1130 */ + POINTonE1_times_minus_z(&t0, P); + POINTonE1_times_minus_z(&t1, &t0); + POINTonE1_cneg(&t1, 1); /* [-z²]P */ + + sigma(&t0, P); /* σ(P) */ + sigma(&t0, &t0); /* σ²(P) */ + + return POINTonE1_is_equal(&t0, &t1); +} +#endif + +int blst_p1_in_g1(const POINTonE1 *p) +{ return (int)POINTonE1_in_G1(p); } + +int blst_p1_affine_in_g1(const POINTonE1_affine *p) +{ + POINTonE1 P; + + vec_copy(P.X, p->X, 2*sizeof(P.X)); + vec_select(P.Z, p->X, BLS12_381_Rx.p, sizeof(P.Z), + vec_is_zero(p, sizeof(*p))); + + 
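+    /* an all-zero affine input (the point-at-infinity encoding) ends up with
+     * Z = 0, any other input with Z = 1 in Montgomery form, so the projective
+     * subgroup check below covers both cases. */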
return (int)POINTonE1_in_G1(&P); +} diff --git a/crypto/blst_src/map_to_g2.c b/crypto/blst_src/map_to_g2.c new file mode 100644 index 00000000000..90fd86e9d31 --- /dev/null +++ b/crypto/blst_src/map_to_g2.c @@ -0,0 +1,444 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" + +/* + * y^2 = x^3 + A'*x + B', isogenous one + */ +static const vec384x Aprime_E2 = { /* 240*i */ + { 0 }, + { TO_LIMB_T(0xe53a000003135242), TO_LIMB_T(0x01080c0fdef80285), + TO_LIMB_T(0xe7889edbe340f6bd), TO_LIMB_T(0x0b51375126310601), + TO_LIMB_T(0x02d6985717c744ab), TO_LIMB_T(0x1220b4e979ea5467) } +}; +static const vec384x Bprime_E2 = { /* 1012 + 1012*i */ + { TO_LIMB_T(0x22ea00000cf89db2), TO_LIMB_T(0x6ec832df71380aa4), + TO_LIMB_T(0x6e1b94403db5a66e), TO_LIMB_T(0x75bf3c53a79473ba), + TO_LIMB_T(0x3dd3a569412c0a34), TO_LIMB_T(0x125cdb5e74dc4fd1) }, + { TO_LIMB_T(0x22ea00000cf89db2), TO_LIMB_T(0x6ec832df71380aa4), + TO_LIMB_T(0x6e1b94403db5a66e), TO_LIMB_T(0x75bf3c53a79473ba), + TO_LIMB_T(0x3dd3a569412c0a34), TO_LIMB_T(0x125cdb5e74dc4fd1) } +}; + +static void map_fp2_times_Zz(vec384x map[], const vec384x isogeny_map[], + const vec384x Zz_powers[], size_t n) +{ + while (n--) + mul_fp2(map[n], isogeny_map[n], Zz_powers[n]); +} + +static void map_fp2(vec384x acc, const vec384x x, const vec384x map[], size_t n) +{ + while (n--) { + mul_fp2(acc, acc, x); + add_fp2(acc, acc, map[n]); + } +} + +static void isogeny_map_to_E2(POINTonE2 *out, const POINTonE2 *p) +{ + /* + * x = x_num / x_den, where + * x_num = k_(1,3) * x'^3 + k_(1,2) * x'^2 + k_(1,1) * x' + k_(1,0) + * ... + */ + static const vec384x isogeny_map_x_num[] = { /* (k_(1,*)<<384) % P */ + {{ TO_LIMB_T(0x47f671c71ce05e62), TO_LIMB_T(0x06dd57071206393e), + TO_LIMB_T(0x7c80cd2af3fd71a2), TO_LIMB_T(0x048103ea9e6cd062), + TO_LIMB_T(0xc54516acc8d037f6), TO_LIMB_T(0x13808f550920ea41) }, + { TO_LIMB_T(0x47f671c71ce05e62), TO_LIMB_T(0x06dd57071206393e), + TO_LIMB_T(0x7c80cd2af3fd71a2), TO_LIMB_T(0x048103ea9e6cd062), + TO_LIMB_T(0xc54516acc8d037f6), TO_LIMB_T(0x13808f550920ea41) }}, + {{ 0 }, + { TO_LIMB_T(0x5fe55555554c71d0), TO_LIMB_T(0x873fffdd236aaaa3), + TO_LIMB_T(0x6a6b4619b26ef918), TO_LIMB_T(0x21c2888408874945), + TO_LIMB_T(0x2836cda7028cabc5), TO_LIMB_T(0x0ac73310a7fd5abd) }}, + {{ TO_LIMB_T(0x0a0c5555555971c3), TO_LIMB_T(0xdb0c00101f9eaaae), + TO_LIMB_T(0xb1fb2f941d797997), TO_LIMB_T(0xd3960742ef416e1c), + TO_LIMB_T(0xb70040e2c20556f4), TO_LIMB_T(0x149d7861e581393b) }, + { TO_LIMB_T(0xaff2aaaaaaa638e8), TO_LIMB_T(0x439fffee91b55551), + TO_LIMB_T(0xb535a30cd9377c8c), TO_LIMB_T(0x90e144420443a4a2), + TO_LIMB_T(0x941b66d3814655e2), TO_LIMB_T(0x0563998853fead5e) }}, + {{ TO_LIMB_T(0x40aac71c71c725ed), TO_LIMB_T(0x190955557a84e38e), + TO_LIMB_T(0xd817050a8f41abc3), TO_LIMB_T(0xd86485d4c87f6fb1), + TO_LIMB_T(0x696eb479f885d059), TO_LIMB_T(0x198e1a74328002d2) }, + { 0 }} + }; + /* ... 
+ * x_den = x'^2 + k_(2,1) * x' + k_(2,0) + */ + static const vec384x isogeny_map_x_den[] = { /* (k_(2,*)<<384) % P */ + {{ 0 }, + { TO_LIMB_T(0x1f3affffff13ab97), TO_LIMB_T(0xf25bfc611da3ff3e), + TO_LIMB_T(0xca3757cb3819b208), TO_LIMB_T(0x3e6427366f8cec18), + TO_LIMB_T(0x03977bc86095b089), TO_LIMB_T(0x04f69db13f39a952) }}, + {{ TO_LIMB_T(0x447600000027552e), TO_LIMB_T(0xdcb8009a43480020), + TO_LIMB_T(0x6f7ee9ce4a6e8b59), TO_LIMB_T(0xb10330b7c0a95bc6), + TO_LIMB_T(0x6140b1fcfb1e54b7), TO_LIMB_T(0x0381be097f0bb4e1) }, + { TO_LIMB_T(0x7588ffffffd8557d), TO_LIMB_T(0x41f3ff646e0bffdf), + TO_LIMB_T(0xf7b1e8d2ac426aca), TO_LIMB_T(0xb3741acd32dbb6f8), + TO_LIMB_T(0xe9daf5b9482d581f), TO_LIMB_T(0x167f53e0ba7431b8) }} + }; + /* + * y = y' * y_num / y_den, where + * y_num = k_(3,3) * x'^3 + k_(3,2) * x'^2 + k_(3,1) * x' + k_(3,0) + * ... + */ + static const vec384x isogeny_map_y_num[] = { /* (k_(3,*)<<384) % P */ + {{ TO_LIMB_T(0x96d8f684bdfc77be), TO_LIMB_T(0xb530e4f43b66d0e2), + TO_LIMB_T(0x184a88ff379652fd), TO_LIMB_T(0x57cb23ecfae804e1), + TO_LIMB_T(0x0fd2e39eada3eba9), TO_LIMB_T(0x08c8055e31c5d5c3) }, + { TO_LIMB_T(0x96d8f684bdfc77be), TO_LIMB_T(0xb530e4f43b66d0e2), + TO_LIMB_T(0x184a88ff379652fd), TO_LIMB_T(0x57cb23ecfae804e1), + TO_LIMB_T(0x0fd2e39eada3eba9), TO_LIMB_T(0x08c8055e31c5d5c3) }}, + {{ 0 }, + { TO_LIMB_T(0xbf0a71c71c91b406), TO_LIMB_T(0x4d6d55d28b7638fd), + TO_LIMB_T(0x9d82f98e5f205aee), TO_LIMB_T(0xa27aa27b1d1a18d5), + TO_LIMB_T(0x02c3b2b2d2938e86), TO_LIMB_T(0x0c7d13420b09807f) }}, + {{ TO_LIMB_T(0xd7f9555555531c74), TO_LIMB_T(0x21cffff748daaaa8), + TO_LIMB_T(0x5a9ad1866c9bbe46), TO_LIMB_T(0x4870a2210221d251), + TO_LIMB_T(0x4a0db369c0a32af1), TO_LIMB_T(0x02b1ccc429ff56af) }, + { TO_LIMB_T(0xe205aaaaaaac8e37), TO_LIMB_T(0xfcdc000768795556), + TO_LIMB_T(0x0c96011a8a1537dd), TO_LIMB_T(0x1c06a963f163406e), + TO_LIMB_T(0x010df44c82a881e6), TO_LIMB_T(0x174f45260f808feb) }}, + {{ TO_LIMB_T(0xa470bda12f67f35c), TO_LIMB_T(0xc0fe38e23327b425), + TO_LIMB_T(0xc9d3d0f2c6f0678d), TO_LIMB_T(0x1c55c9935b5a982e), + TO_LIMB_T(0x27f6c0e2f0746764), TO_LIMB_T(0x117c5e6e28aa9054) }, + { 0 }} + }; + /* ... 
+ * y_den = x'^3 + k_(4,2) * x'^2 + k_(4,1) * x' + k_(4,0) + */ + static const vec384x isogeny_map_y_den[] = { /* (k_(4,*)<<384) % P */ + {{ TO_LIMB_T(0x0162fffffa765adf), TO_LIMB_T(0x8f7bea480083fb75), + TO_LIMB_T(0x561b3c2259e93611), TO_LIMB_T(0x11e19fc1a9c875d5), + TO_LIMB_T(0xca713efc00367660), TO_LIMB_T(0x03c6a03d41da1151) }, + { TO_LIMB_T(0x0162fffffa765adf), TO_LIMB_T(0x8f7bea480083fb75), + TO_LIMB_T(0x561b3c2259e93611), TO_LIMB_T(0x11e19fc1a9c875d5), + TO_LIMB_T(0xca713efc00367660), TO_LIMB_T(0x03c6a03d41da1151) }}, + {{ 0 }, + { TO_LIMB_T(0x5db0fffffd3b02c5), TO_LIMB_T(0xd713f52358ebfdba), + TO_LIMB_T(0x5ea60761a84d161a), TO_LIMB_T(0xbb2c75a34ea6c44a), + TO_LIMB_T(0x0ac6735921c1119b), TO_LIMB_T(0x0ee3d913bdacfbf6) }}, + {{ TO_LIMB_T(0x66b10000003affc5), TO_LIMB_T(0xcb1400e764ec0030), + TO_LIMB_T(0xa73e5eb56fa5d106), TO_LIMB_T(0x8984c913a0fe09a9), + TO_LIMB_T(0x11e10afb78ad7f13), TO_LIMB_T(0x05429d0e3e918f52) }, + { TO_LIMB_T(0x534dffffffc4aae6), TO_LIMB_T(0x5397ff174c67ffcf), + TO_LIMB_T(0xbff273eb870b251d), TO_LIMB_T(0xdaf2827152870915), + TO_LIMB_T(0x393a9cbaca9e2dc3), TO_LIMB_T(0x14be74dbfaee5748) }} + }; + vec384x Zz_powers[3], map[3], xn, xd, yn, yd; + + /* lay down Z^2 powers in descending order */ + sqr_fp2(Zz_powers[2], p->Z); /* ZZ^1 */ + sqr_fp2(Zz_powers[1], Zz_powers[2]); /* ZZ^2 1+1 */ + mul_fp2(Zz_powers[0], Zz_powers[2], Zz_powers[1]); /* ZZ^3 2+1 */ + + map_fp2_times_Zz(map, isogeny_map_x_num, Zz_powers, 3); + mul_fp2(xn, p->X, isogeny_map_x_num[3]); + add_fp2(xn, xn, map[2]); + map_fp2(xn, p->X, map, 2); + + map_fp2_times_Zz(map, isogeny_map_x_den, Zz_powers + 1, 2); + add_fp2(xd, p->X, map[1]); + map_fp2(xd, p->X, map, 1); + mul_fp2(xd, xd, Zz_powers[2]); /* xd *= Z^2 */ + + map_fp2_times_Zz(map, isogeny_map_y_num, Zz_powers, 3); + mul_fp2(yn, p->X, isogeny_map_y_num[3]); + add_fp2(yn, yn, map[2]); + map_fp2(yn, p->X, map, 2); + mul_fp2(yn, yn, p->Y); /* yn *= Y */ + + map_fp2_times_Zz(map, isogeny_map_y_den, Zz_powers, 3); + add_fp2(yd, p->X, map[2]); + map_fp2(yd, p->X, map, 2); + mul_fp2(Zz_powers[2], Zz_powers[2], p->Z); + mul_fp2(yd, yd, Zz_powers[2]); /* yd *= Z^3 */ + + /* convert (xn, xd, yn, yd) to Jacobian coordinates */ + mul_fp2(out->Z, xd, yd); /* Z = xd * yd */ + mul_fp2(out->X, xn, yd); + mul_fp2(out->X, out->X, out->Z); /* X = xn * xd * yd^2 */ + sqr_fp2(out->Y, out->Z); + mul_fp2(out->Y, out->Y, xd); + mul_fp2(out->Y, out->Y, yn); /* Y = yn * xd^3 * yd^2 */ +} + +static void map_to_isogenous_E2(POINTonE2 *p, const vec384x u) +{ + static const vec384x minus_A = { + { 0 }, + { TO_LIMB_T(0xd4c4fffffcec5869), TO_LIMB_T(0x1da3f3eed25bfd79), + TO_LIMB_T(0x7fa833c5136fff67), TO_LIMB_T(0x59261433cd540cbd), + TO_LIMB_T(0x48450f5f2b84682c), TO_LIMB_T(0x07e05d00bf959233) } + }; + static const vec384x Z = { /* -2 - i */ + { TO_LIMB_T(0x87ebfffffff9555c), TO_LIMB_T(0x656fffe5da8ffffa), + TO_LIMB_T(0x0fd0749345d33ad2), TO_LIMB_T(0xd951e663066576f4), + TO_LIMB_T(0xde291a3d41e980d3), TO_LIMB_T(0x0815664c7dfe040d) }, + { TO_LIMB_T(0x43f5fffffffcaaae), TO_LIMB_T(0x32b7fff2ed47fffd), + TO_LIMB_T(0x07e83a49a2e99d69), TO_LIMB_T(0xeca8f3318332bb7a), + TO_LIMB_T(0xef148d1ea0f4c069), TO_LIMB_T(0x040ab3263eff0206) } + }; + static const vec384x recip_ZZZ = { /* 1/(Z^3) */ + { TO_LIMB_T(0x65018f5c28f598eb), TO_LIMB_T(0xe6020417f022d916), + TO_LIMB_T(0xd6327313288369c7), TO_LIMB_T(0x622ded8eb447156f), + TO_LIMB_T(0xe52a2aee72c2a01f), TO_LIMB_T(0x089812fb8481ffe4) }, + { TO_LIMB_T(0x2574eb851eb8619f), TO_LIMB_T(0xdba2e97912925604), + TO_LIMB_T(0x67e495a909e7a18e), 
TO_LIMB_T(0xdf2da23b8145b8f7), + TO_LIMB_T(0xcf5d3728310ebf6d), TO_LIMB_T(0x11be446236f4c116) } + }; + static const vec384x magic_ZZZ = { /* 1/Z^3 = a + b*i */ + /* a^2 + b^2 */ + { TO_LIMB_T(0xaa7eb851eb8508e0), TO_LIMB_T(0x1c54fdf360989374), + TO_LIMB_T(0xc87f2fc6e716c62e), TO_LIMB_T(0x0124aefb1f9efea7), + TO_LIMB_T(0xb2f8be63e844865c), TO_LIMB_T(0x08b47f775a7ef35a) }, + /* (a^2 + b^2)^((P-3)/4) */ + { TO_LIMB_T(0xe4132bbd838cf70a), TO_LIMB_T(0x01d769ac83772c19), + TO_LIMB_T(0xa83dd6e974c22e45), TO_LIMB_T(0xbc8ec3e777b08dff), + TO_LIMB_T(0xc035c2042ecf5da3), TO_LIMB_T(0x073929e97f0850bf) } + }; + static const vec384x ZxA = { /* 240 - 480*i */ + { TO_LIMB_T(0xe53a000003135242), TO_LIMB_T(0x01080c0fdef80285), + TO_LIMB_T(0xe7889edbe340f6bd), TO_LIMB_T(0x0b51375126310601), + TO_LIMB_T(0x02d6985717c744ab), TO_LIMB_T(0x1220b4e979ea5467) }, + { TO_LIMB_T(0xa989fffff9d8b0d2), TO_LIMB_T(0x3b47e7dda4b7faf3), + TO_LIMB_T(0xff50678a26dffece), TO_LIMB_T(0xb24c28679aa8197a), + TO_LIMB_T(0x908a1ebe5708d058), TO_LIMB_T(0x0fc0ba017f2b2466) } + }; + vec384x uu, tv2, tv4, x2n, gx1, gxd, y2; +#if 0 + vec384x xn, x1n, xd, y, y1, Zuu; +#else +# define xn p->X +# define y p->Y +# define xd p->Z +# define x1n xn +# define y1 y +# define Zuu x2n +#endif +#define sgn0_fp2(a) (sgn0_pty_mont_384x((a), BLS12_381_P, p0) & 1) + bool_t e1, e2; + + /* + * as per map_to_curve() from poc/sswu_opt.sage at + * https://github.com/cfrg/draft-irtf-cfrg-hash-to-curve + * with 9mod16 twists... + */ + /* x numerator variants */ + sqr_fp2(uu, u); /* uu = u^2 */ + mul_fp2(Zuu, Z, uu); /* Zuu = Z * uu */ + sqr_fp2(tv2, Zuu); /* tv2 = Zuu^2 */ + add_fp2(tv2, tv2, Zuu); /* tv2 = tv2 + Zuu */ + add_fp2(x1n, tv2, BLS12_381_Rx.p2); /* x1n = tv2 + 1 */ + mul_fp2(x1n, x1n, Bprime_E2); /* x1n = x1n * B */ + mul_fp2(x2n, Zuu, x1n); /* x2n = Zuu * x1n */ + + /* x denumenator */ + mul_fp2(xd, minus_A, tv2); /* xd = -A * tv2 */ + e1 = vec_is_zero(xd, sizeof(xd)); /* e1 = xd == 0 */ + vec_select(xd, ZxA, xd, sizeof(xd), e1); /* # If xd == 0, set xd = Z*A */ + + /* y numerators variants */ + sqr_fp2(tv2, xd); /* tv2 = xd^2 */ + mul_fp2(gxd, xd, tv2); /* gxd = xd^3 */ + mul_fp2(tv2, Aprime_E2, tv2); /* tv2 = A * tv2 */ + sqr_fp2(gx1, x1n); /* gx1 = x1n^2 */ + add_fp2(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1n^2 + A*xd^2 */ + mul_fp2(gx1, gx1, x1n); /* gx1 = gx1 * x1n # x1n^3 + A*x1n*xd^2 */ + mul_fp2(tv2, Bprime_E2, gxd); /* tv2 = B * gxd */ + add_fp2(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1^3 + A*x1*xd^2 + B*xd^3 */ + sqr_fp2(tv4, gxd); /* tv4 = gxd^2 */ + mul_fp2(tv2, gx1, gxd); /* tv2 = gx1 * gxd */ + mul_fp2(tv4, tv4, tv2); /* tv4 = tv4 * tv2 # gx1*gxd^3 */ + e2 = recip_sqrt_fp2(y1, tv4, /* y1 = tv4^c1 # (gx1*gxd^3)^((p^2-9)/16) */ + recip_ZZZ, magic_ZZZ); + mul_fp2(y1, y1, tv2); /* y1 = y1 * tv2 # gx1*gxd*y1 */ + mul_fp2(y2, y1, uu); /* y2 = y1 * uu */ + mul_fp2(y2, y2, u); /* y2 = y2 * u */ + + /* choose numerators */ + vec_select(xn, x1n, x2n, sizeof(xn), e2); /* xn = e2 ? x1n : x2n */ + vec_select(y, y1, y2, sizeof(y), e2); /* y = e2 ? 
y1 : y2 */ + + e1 = sgn0_fp2(u); + e2 = sgn0_fp2(y); + cneg_fp2(y, y, e1^e2); /* fix sign of y */ + /* return (xn, xd, y, 1) */ + + /* convert (xn, xd, y, 1) to Jacobian projective coordinates */ + mul_fp2(p->X, xn, xd); /* X = xn * xd */ + mul_fp2(p->Y, y, gxd); /* Y = y * xd^3 */ +#ifndef xd + vec_copy(p->Z, xd, sizeof(xd)); /* Z = xd */ +#else +# undef xn +# undef y +# undef xd +# undef x1n +# undef y1 +# undef Zuu +# undef tv4 +#endif +#undef sgn0_fp2 +} + +#if 0 +static const byte h_eff[] = { + TO_BYTES(0xe8020005aaa95551), TO_BYTES(0x59894c0adebbf6b4), + TO_BYTES(0xe954cbc06689f6a3), TO_BYTES(0x2ec0ec69d7477c1a), + TO_BYTES(0x6d82bf015d1212b0), TO_BYTES(0x329c2f178731db95), + TO_BYTES(0x9986ff031508ffe1), TO_BYTES(0x88e2a8e9145ad768), + TO_BYTES(0x584c6a0ea91b3528), TO_BYTES(0x0bc69f08f2ee75b3) +}; + +static void clear_cofactor(POINTonE2 *out, const POINTonE2 *p) +{ POINTonE2_mult_w5(out, p, h_eff, 636); } +#else +/* + * As per suggestions in "7. Clearing the cofactor" at + * https://tools.ietf.org/html/draft-irtf-cfrg-hash-to-curve-06 + */ +static void POINTonE2_add_n_dbl(POINTonE2 *out, const POINTonE2 *p, size_t n) +{ + POINTonE2_dadd(out, out, p, NULL); + while(n--) + POINTonE2_double(out, out); +} + +static void POINTonE2_times_minus_z(POINTonE2 *out, const POINTonE2 *in) +{ + POINTonE2_double(out, in); /* 1: 0x2 */ + POINTonE2_add_n_dbl(out, in, 2); /* 2..4: 0x3..0xc */ + POINTonE2_add_n_dbl(out, in, 3); /* 5..8: 0xd..0x68 */ + POINTonE2_add_n_dbl(out, in, 9); /* 9..18: 0x69..0xd200 */ + POINTonE2_add_n_dbl(out, in, 32); /* 19..51: ..0xd20100000000 */ + POINTonE2_add_n_dbl(out, in, 16); /* 52..68: ..0xd201000000010000 */ +} + +static void psi(POINTonE2 *out, const POINTonE2 *in); + +static void clear_cofactor(POINTonE2 *out, const POINTonE2 *p) +{ + POINTonE2 t0, t1; + + /* A.Budroni, F.Pintore, "Efficient hash maps to G2 on BLS curves" */ + POINTonE2_double(out, p); /* out = 2P */ + psi(out, out); /* out = Ψ(2P) */ + psi(out, out); /* out = Ψ²(2P) */ + + vec_copy(&t0, p, sizeof(t0)); + POINTonE2_cneg(&t0, 1); /* t0 = -P */ + psi(&t1, &t0); /* t1 = -Ψ(P) */ + POINTonE2_dadd(out, out, &t0, NULL);/* out = Ψ²(2P) - P */ + POINTonE2_dadd(out, out, &t1, NULL);/* out = Ψ²(2P) - P - Ψ(P) */ + + POINTonE2_times_minus_z(&t0, p); /* t0 = [-z]P */ + POINTonE2_dadd(&t0, &t0, p, NULL); /* t0 = [-z + 1]P */ + POINTonE2_dadd(&t0, &t0, &t1, NULL);/* t0 = [-z + 1]P - Ψ(P) */ + POINTonE2_times_minus_z(&t1, &t0); /* t1 = [z² - z]P + [z]Ψ(P) */ + POINTonE2_dadd(out, out, &t1, NULL);/* out = [z² - z - 1]P */ + /* + [z - 1]Ψ(P) */ + /* + Ψ²(2P) */ +} +#endif + +/* + * |u|, |v| are expected to be in Montgomery representation + */ +static void map_to_g2(POINTonE2 *out, const vec384x u, const vec384x v) +{ + POINTonE2 p; + + map_to_isogenous_E2(&p, u); + + if (v != NULL) { + map_to_isogenous_E2(out, v); /* borrow |out| */ + POINTonE2_dadd(&p, &p, out, Aprime_E2); + } + + isogeny_map_to_E2(&p, &p); /* sprinkle isogenous powder */ + clear_cofactor(out, &p); +} + +void blst_map_to_g2(POINTonE2 *out, const vec384x u, const vec384x v) +{ map_to_g2(out, u, v); } + +static void Encode_to_G2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384x u[1]; + + hash_to_field(u[0], 2, aug, aug_len, msg, msg_len, DST, DST_len); + map_to_g2(p, u[0], NULL); +} + +void blst_encode_to_g2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t 
aug_len) +{ Encode_to_G2(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static void Hash_to_G2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384x u[2]; + + hash_to_field(u[0], 4, aug, aug_len, msg, msg_len, DST, DST_len); + map_to_g2(p, u[0], u[1]); +} + +void blst_hash_to_g2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ Hash_to_G2(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static bool_t POINTonE2_in_G2(const POINTonE2 *P) +{ +#if 0 + POINTonE2 t0, t1, t2; + + /* Bowe, S., "Faster subgroup checks for BLS12-381" */ + psi(&t0, P); /* Ψ(P) */ + psi(&t0, &t0); /* Ψ²(P) */ + psi(&t1, &t0); /* Ψ³(P) */ + + POINTonE2_times_minus_z(&t2, &t1); + POINTonE2_dadd(&t0, &t0, &t2, NULL); + POINTonE2_cneg(&t0, 1); + POINTonE2_dadd(&t0, &t0, P, NULL); /* [z]Ψ³(P) - Ψ²(P) + P */ + + return vec_is_zero(t0.Z, sizeof(t0.Z)); +#else + POINTonE2 t0, t1; + + /* Scott, M., https://eprint.iacr.org/2021/1130 */ + psi(&t0, P); /* Ψ(P) */ + + POINTonE2_times_minus_z(&t1, P); + POINTonE2_cneg(&t1, 1); /* [z]P */ + + return POINTonE2_is_equal(&t0, &t1); +#endif +} + +int blst_p2_in_g2(const POINTonE2 *p) +{ return (int)POINTonE2_in_G2(p); } + +int blst_p2_affine_in_g2(const POINTonE2_affine *p) +{ + POINTonE2 P; + + vec_copy(P.X, p->X, 2*sizeof(P.X)); + vec_select(P.Z, p->X, BLS12_381_Rx.p, sizeof(P.Z), + vec_is_zero(p, sizeof(*p))); + + return (int)POINTonE2_in_G2(&P); +} diff --git a/crypto/blst_src/multi_scalar.c b/crypto/blst_src/multi_scalar.c new file mode 100644 index 00000000000..d0b3deefe25 --- /dev/null +++ b/crypto/blst_src/multi_scalar.c @@ -0,0 +1,414 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" +#include "point.h" + +/* + * Infinite point among inputs would be devastating. Shall we change it? + */ +#define POINTS_TO_AFFINE_IMPL(prefix, ptype, bits, field) \ +static void ptype##s_to_affine(ptype##_affine dst[], \ + const ptype *const points[], size_t npoints) \ +{ \ + size_t i; \ + vec##bits *acc, ZZ, ZZZ; \ + const ptype *point = NULL; \ + const size_t stride = sizeof(ptype)==sizeof(POINTonE1) ? 1536 : 768; \ +\ + while (npoints) { \ + const ptype *p, *const *walkback; \ + size_t delta = strideZ, sizeof(vec##bits)); \ + for (i = 1; i < delta; i++, acc++) \ + point = *points ? *points++ : point+1, \ + mul_##field(acc[0], acc[-1], point->Z); \ +\ + --acc; reciprocal_##field(acc[0], acc[0]); \ +\ + walkback = points-1, p = point, --delta, dst += delta; \ + for (i = 0; i < delta; i++, acc--, dst--) { \ + mul_##field(acc[-1], acc[-1], acc[0]); /* 1/Z */\ + sqr_##field(ZZ, acc[-1]); /* 1/Z^2 */\ + mul_##field(ZZZ, ZZ, acc[-1]); /* 1/Z^3 */\ + mul_##field(acc[-1], p->Z, acc[0]); \ + mul_##field(dst->X, p->X, ZZ); /* X = X'/Z^2 */\ + mul_##field(dst->Y, p->Y, ZZZ); /* Y = Y'/Z^3 */\ + p = (p == *walkback) ? 
*--walkback : p-1; \ + } \ + sqr_##field(ZZ, acc[0]); /* 1/Z^2 */\ + mul_##field(ZZZ, ZZ, acc[0]); /* 1/Z^3 */\ + mul_##field(dst->X, p->X, ZZ); /* X = X'/Z^2 */\ + mul_##field(dst->Y, p->Y, ZZZ); /* Y = Y'/Z^3 */\ + ++delta, dst += delta, npoints -= delta; \ + } \ +} \ +\ +void prefix##s_to_affine(ptype##_affine dst[], const ptype *const points[], \ + size_t npoints) \ +{ ptype##s_to_affine(dst, points, npoints); } + +POINTS_TO_AFFINE_IMPL(blst_p1, POINTonE1, 384, fp) +POINTS_TO_AFFINE_IMPL(blst_p2, POINTonE2, 384x, fp2) + +/* + * This is two-step multi-scalar multiplication procedure. First, given + * a set of points you pre-compute a table for chosen windowing factor + * [expressed in bits with value between 2 and 14], and then you pass + * this table to the actual multiplication procedure along with scalars. + * Idea is that the pre-computed table will be reused multiple times. In + * which case multiplication runs faster than below Pippenger algorithm + * implementation for up to ~16K points for wbits=8, naturally at the + * expense of multi-megabyte table. One can trade even more memory for + * performance, but each wbits increment doubles the memory requirement, + * so at some point it gets prohibively large... For reference, without + * reusing the table it's faster than Pippenger algorithm for up ~32 + * points [with wbits=5]... + */ + +#define SCRATCH_SZ(ptype) (sizeof(ptype)==sizeof(POINTonE1) ? 8192 : 4096) + +#define PRECOMPUTE_WBITS_IMPL(prefix, ptype, bits, field, one) \ +static void ptype##_precompute_row_wbits(ptype row[], size_t wbits, \ + const ptype##_affine *point) \ +{ \ + size_t i, j, n = (size_t)1 << (wbits-1); \ + /* row[-1] is implicit infinity */\ + vec_copy(&row[0], point, sizeof(*point)); /* row[0]=p*1 */\ + vec_copy(&row[0].Z, one, sizeof(row[0].Z)); \ + ptype##_double(&row[1], &row[0]); /* row[1]=p*(1+1) */\ + for (i = 2, j = 1; i < n; i += 2, j++) \ + ptype##_add_affine(&row[i], &row[i-1], point), /* row[2]=p*(2+1) */\ + ptype##_double(&row[i+1], &row[j]); /* row[3]=p*(2+2) */\ +} /* row[4] ... */\ +\ +static void ptype##s_to_affine_row_wbits(ptype##_affine dst[], ptype src[], \ + size_t wbits, size_t npoints) \ +{ \ + size_t total = npoints << (wbits-1); \ + size_t nwin = (size_t)1 << (wbits-1); \ + size_t i, j; \ + vec##bits *acc, ZZ, ZZZ; \ +\ + src += total; \ + acc = (vec##bits *)src; \ + vec_copy(acc++, one, sizeof(vec##bits)); \ + for (i = 0; i < npoints; i++) \ + for (j = nwin; --src, --j; acc++) \ + mul_##field(acc[0], acc[-1], src->Z); \ +\ + --acc; reciprocal_##field(acc[0], acc[0]); \ +\ + for (i = 0; i < npoints; i++) { \ + vec_copy(dst++, src++, sizeof(ptype##_affine)); \ + for (j = 1; j < nwin; j++, acc--, src++, dst++) { \ + mul_##field(acc[-1], acc[-1], acc[0]); /* 1/Z */\ + sqr_##field(ZZ, acc[-1]); /* 1/Z^2 */\ + mul_##field(ZZZ, ZZ, acc[-1]); /* 1/Z^3 */\ + mul_##field(acc[-1], src->Z, acc[0]); \ + mul_##field(dst->X, src->X, ZZ); /* X = X'/Z^2 */\ + mul_##field(dst->Y, src->Y, ZZZ); /* Y = Y'/Z^3 */\ + } \ + } \ +} \ +\ +/* flat |points[n]| can be placed at the end of |table[n<<(wbits-1)]| */\ +static void ptype##s_precompute_wbits(ptype##_affine table[], size_t wbits, \ + const ptype##_affine *const points[], \ + size_t npoints) \ +{ \ + size_t total = npoints << (wbits-1); \ + size_t nwin = (size_t)1 << (wbits-1); \ + size_t nmin = wbits>9 ? 
(size_t)1: (size_t)1 << (9-wbits); \ + size_t i, top = 0; \ + ptype *rows, *row; \ + const ptype##_affine *point = NULL; \ + size_t stride = ((512*1024)/sizeof(ptype##_affine)) >> wbits; \ + if (stride == 0) stride = 1; \ +\ + while (npoints >= nmin) { \ + size_t limit = total - npoints; \ +\ + if (top + (stride << wbits) > limit) { \ + stride = (limit - top) >> wbits; \ + if (stride == 0) break; \ + } \ + rows = row = (ptype *)(&table[top]); \ + for (i = 0; i < stride; i++, row += nwin) \ + point = *points ? *points++ : point+1, \ + ptype##_precompute_row_wbits(row, wbits, point); \ + ptype##s_to_affine_row_wbits(&table[top], rows, wbits, stride); \ + top += stride << (wbits-1); \ + npoints -= stride; \ + } \ + rows = row = alloca(2*sizeof(ptype##_affine) * npoints * nwin); \ + for (i = 0; i < npoints; i++, row += nwin) \ + point = *points ? *points++ : point+1, \ + ptype##_precompute_row_wbits(row, wbits, point); \ + ptype##s_to_affine_row_wbits(&table[top], rows, wbits, npoints); \ +} \ +\ +size_t prefix##s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints) \ +{ return (sizeof(ptype##_affine)*npoints) << (wbits-1); } \ +void prefix##s_mult_wbits_precompute(ptype##_affine table[], size_t wbits, \ + const ptype##_affine *const points[], \ + size_t npoints) \ +{ ptype##s_precompute_wbits(table, wbits, points, npoints); } + +#define POINTS_MULT_WBITS_IMPL(prefix, ptype, bits, field, one) \ +static void ptype##_gather_booth_wbits(ptype *p, const ptype##_affine row[], \ + size_t wbits, limb_t booth_idx) \ +{ \ + bool_t booth_sign = (booth_idx >> wbits) & 1; \ + bool_t idx_is_zero; \ + static const ptype##_affine infinity = { 0 }; \ +\ + booth_idx &= ((limb_t)1 << wbits) - 1; \ + idx_is_zero = is_zero(booth_idx); \ + booth_idx -= 1 ^ idx_is_zero; \ + vec_select(p, &infinity, &row[booth_idx], sizeof(row[0]), idx_is_zero); \ + ptype##_cneg(p, booth_sign); \ +} \ +\ +static void ptype##s_mult_wbits(ptype *ret, const ptype##_affine table[], \ + size_t wbits, size_t npoints, \ + const byte *const scalars[], size_t nbits, \ + ptype scratch[]) \ +{ \ + limb_t wmask, wval; \ + size_t i, j, z, nbytes, window, nwin = (size_t)1 << (wbits-1); \ + const byte *scalar, *const *scalar_s = scalars; \ + const ptype##_affine *row = table; \ +\ + size_t scratch_sz = SCRATCH_SZ(ptype); \ + if (scratch == NULL) { \ + scratch_sz /= 4; /* limit to 288K */ \ + scratch_sz = scratch_sz < npoints ? scratch_sz : npoints; \ + scratch = alloca(sizeof(ptype) * scratch_sz); \ + } \ +\ + nbytes = (nbits + 7)/8; /* convert |nbits| to bytes */ \ + scalar = *scalar_s++; \ +\ + /* top excess bits modulo target window size */ \ + window = nbits % wbits; /* yes, it may be zero */ \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ +\ + nbits -= window; \ + z = is_zero(nbits); \ + wval = (get_wval_limb(scalar, nbits - (z^1), wbits + (z^1)) << z) & wmask; \ + wval = booth_encode(wval, wbits); \ + ptype##_gather_booth_wbits(&scratch[0], row, wbits, wval); \ + row += nwin; \ +\ + i = 1; vec_zero(ret, sizeof(*ret)); \ + while (nbits > 0) { \ + for (j = i; i < npoints; i++, j++, row += nwin) { \ + if (j == scratch_sz) \ + ptype##s_accumulate(ret, scratch, j), j = 0; \ + scalar = *scalar_s ? 
*scalar_s++ : scalar+nbytes; \ + wval = get_wval_limb(scalar, nbits - 1, window + 1) & wmask; \ + wval = booth_encode(wval, wbits); \ + ptype##_gather_booth_wbits(&scratch[j], row, wbits, wval); \ + } \ + ptype##s_accumulate(ret, scratch, j); \ +\ + for (j = 0; j < wbits; j++) \ + ptype##_double(ret, ret); \ +\ + window = wbits; \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ + nbits -= window; \ + i = 0; row = table; scalar_s = scalars; \ + } \ +\ + for (j = i; i < npoints; i++, j++, row += nwin) { \ + if (j == scratch_sz) \ + ptype##s_accumulate(ret, scratch, j), j = 0; \ + scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \ + wval = (get_wval_limb(scalar, 0, wbits) << 1) & wmask; \ + wval = booth_encode(wval, wbits); \ + ptype##_gather_booth_wbits(&scratch[j], row, wbits, wval); \ + } \ + ptype##s_accumulate(ret, scratch, j); \ +} \ +\ +size_t prefix##s_mult_wbits_scratch_sizeof(size_t npoints) \ +{ \ + const size_t scratch_sz = SCRATCH_SZ(ptype); \ + return sizeof(ptype) * (npoints < scratch_sz ? npoints : scratch_sz); \ +} \ +void prefix##s_mult_wbits(ptype *ret, const ptype##_affine table[], \ + size_t wbits, size_t npoints, \ + const byte *const scalars[], size_t nbits, \ + ptype scratch[]) \ +{ ptype##s_mult_wbits(ret, table, wbits, npoints, scalars, nbits, scratch); } + +PRECOMPUTE_WBITS_IMPL(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p) +POINTS_MULT_WBITS_IMPL(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p) + +PRECOMPUTE_WBITS_IMPL(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINTS_MULT_WBITS_IMPL(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2) + +/* + * Pippenger algorithm implementation, fastest option for larger amount + * of points... + */ + +static size_t pippenger_window_size(size_t npoints) +{ + size_t wbits; + + for (wbits=0; npoints>>=1; wbits++) ; + + return wbits>12 ? wbits-3 : (wbits>4 ? wbits-2 : (wbits ? 2 : 1)); +} + +#define DECLARE_PRIVATE_POINTXYZZ(ptype, bits) \ +typedef struct { vec##bits X,Y,ZZZ,ZZ; } ptype##xyzz; + +#define POINTS_MULT_PIPPENGER_IMPL(prefix, ptype) \ +static void ptype##_integrate_buckets(ptype *out, ptype##xyzz buckets[], \ + size_t wbits) \ +{ \ + ptype##xyzz ret[1], acc[1]; \ + size_t n = (size_t)1 << wbits; \ +\ + /* Calculate sum of x[i-1]*i for i=1 through 1<<|wbits|. 
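+     * |acc| keeps a running suffix sum of the buckets and is added to \
+     * |ret| once per step, which gives bucket x[i-1] its weight i. \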
*/\ + vec_copy(acc, &buckets[--n], sizeof(acc)); \ + vec_copy(ret, &buckets[n], sizeof(ret)); \ + vec_zero(&buckets[n], sizeof(buckets[n])); \ + while (n--) { \ + ptype##xyzz_dadd(acc, acc, &buckets[n]); \ + ptype##xyzz_dadd(ret, ret, acc); \ + vec_zero(&buckets[n], sizeof(buckets[n])); \ + } \ + ptype##xyzz_to_Jacobian(out, ret); \ +} \ +\ +static void ptype##_bucket(ptype##xyzz buckets[], limb_t booth_idx, \ + size_t wbits, const ptype##_affine *p) \ +{ \ + bool_t booth_sign = (booth_idx >> wbits) & 1; \ +\ + booth_idx &= (1< nbits) wbits = nbits - bit0, cbits = wbits + 1; \ + else wbits = cbits = window; \ + ptype##s_tile_pippenger(ret, points, npoints, scalars, nbits, scratch, \ + bit0, wbits, cbits); \ +} \ +void prefix##s_mult_pippenger(ptype *ret, \ + const ptype##_affine *const points[], \ + size_t npoints, \ + const byte *const scalars[], size_t nbits, \ + ptype##xyzz scratch[]) \ +{ ptype##s_mult_pippenger(ret, points, npoints, scalars, nbits, scratch, 0); } + +DECLARE_PRIVATE_POINTXYZZ(POINTonE1, 384) +POINTXYZZ_TO_JACOBIAN_IMPL(POINTonE1, 384, fp) +POINTXYZZ_DADD_IMPL(POINTonE1, 384, fp) +POINTXYZZ_DADD_AFFINE_IMPL(POINTonE1, 384, fp, BLS12_381_Rx.p) +POINTS_MULT_PIPPENGER_IMPL(blst_p1, POINTonE1) + +DECLARE_PRIVATE_POINTXYZZ(POINTonE2, 384x) +POINTXYZZ_TO_JACOBIAN_IMPL(POINTonE2, 384x, fp2) +POINTXYZZ_DADD_IMPL(POINTonE2, 384x, fp2) +POINTXYZZ_DADD_AFFINE_IMPL(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINTS_MULT_PIPPENGER_IMPL(blst_p2, POINTonE2) diff --git a/crypto/blst_src/no_asm.h b/crypto/blst_src/no_asm.h new file mode 100644 index 00000000000..be7bf47e197 --- /dev/null +++ b/crypto/blst_src/no_asm.h @@ -0,0 +1,1345 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#if LIMB_T_BITS==32 +typedef unsigned long long llimb_t; +#endif + +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 || defined(__STDC_NO_VLA__) +# error "unsupported compiler" +#endif + +#if defined(__clang__) +# pragma GCC diagnostic ignored "-Wstatic-in-inline" +#endif + +#if !defined(__clang__) && !defined(__builtin_assume) +# if defined(__GNUC__) && __GNUC__>=5 +# define __builtin_assume(condition) if (!(condition)) __builtin_unreachable() +# elif defined(_MSC_VER) +# define __builtin_assume(condition) __assume(condition) +# else +# define __builtin_assume(condition) (void)(condition) +# endif +#endif + +static void mul_mont_n(limb_t ret[], const limb_t a[], const limb_t b[], + const limb_t p[], limb_t n0, size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + llimb_t limbx; + limb_t mask, borrow, mx, hi, tmp[n+1], carry; + size_t i, j; + + for (mx=b[0], hi=0, i=0; i> LIMB_T_BITS); + } + mx = n0*tmp[0]; + tmp[i] = hi; + + for (carry=0, j=0; ; ) { + limbx = (mx * (llimb_t)p[0]) + tmp[0]; + hi = (limb_t)(limbx >> LIMB_T_BITS); + for (i=1; i> LIMB_T_BITS); + } + limbx = tmp[i] + (hi + (llimb_t)carry); + tmp[i-1] = (limb_t)limbx; + carry = (limb_t)(limbx >> LIMB_T_BITS); + + if (++j==n) + break; + + for (mx=b[j], hi=0, i=0; i> LIMB_T_BITS); + } + mx = n0*tmp[0]; + limbx = hi + (llimb_t)carry; + tmp[i] = (limb_t)limbx; + carry = (limb_t)(limbx >> LIMB_T_BITS); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + launder(mask); + + for(i=0; i> LIMB_T_BITS); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + launder(mask); + + for(i=0; i> LIMB_T_BITS) & 1; + } + + mask = 0 - borrow; + launder(mask); + + for (carry=0, i=0; i> 
LIMB_T_BITS); + } +} + +#define SUB_MOD_IMPL(bits) \ +inline void sub_mod_##bits(vec##bits ret, const vec##bits a, \ + const vec##bits b, const vec##bits p) \ +{ sub_mod_n(ret, a, b, p, NLIMBS(bits)); } + +SUB_MOD_IMPL(256) +SUB_MOD_IMPL(384) + +static void mul_by_3_mod_n(limb_t ret[], const limb_t a[], const limb_t p[], + size_t n) +{ + __builtin_assume(n != 0); + llimb_t limbx; + limb_t mask, carry, borrow, tmp[n], two_a[n]; + size_t i; + + for (carry=0, i=0; i>(LIMB_T_BITS-1); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + launder(mask); + + for(i=0; i> LIMB_T_BITS); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + launder(mask); + + for(i=0; i>(LIMB_T_BITS-1); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + launder(mask); + + for(i=0; i> LIMB_T_BITS) & 1; + } + + flag &= vec_is_zero(a, sizeof(tmp)) ^ 1; + mask = (limb_t)0 - flag; + + for(i=0; i> LIMB_T_BITS) & 1; + } + + return borrow & (is_zero(acc) ^ 1); +} + +#define CHECK_MOD_IMPL(bits) \ +inline limb_t check_mod_##bits(const pow##bits a, const vec##bits p) \ +{ return check_mod_n(a, p, NLIMBS(bits)); } + +CHECK_MOD_IMPL(256) + +static limb_t add_n_check_mod_n(byte ret[], const byte a[], const byte b[], + const limb_t p[], size_t n) +{ + __builtin_assume(n != 0); + limb_t ret_[n], a_[n], b_[n], zero; + + limbs_from_le_bytes(a_, a, sizeof(a_)); + limbs_from_le_bytes(b_, b, sizeof(b_)); + + add_mod_n(ret_, a_, b_, p, n); + zero = vec_is_zero(ret_, sizeof(ret_)); + + le_bytes_from_limbs(ret, ret_, sizeof(ret_)); + + return zero^1; +} + +#define ADD_N_CHECK_MOD_IMPL(bits) \ +inline limb_t add_n_check_mod_##bits(pow##bits ret, const pow##bits a, \ + const pow##bits b, const vec##bits p) \ +{ return add_n_check_mod_n(ret, a, b, p, NLIMBS(bits)); } + +ADD_N_CHECK_MOD_IMPL(256) + +static limb_t sub_n_check_mod_n(byte ret[], const byte a[], const byte b[], + const limb_t p[], size_t n) +{ + __builtin_assume(n != 0); + limb_t ret_[n], a_[n], b_[n], zero; + + limbs_from_le_bytes(a_, a, sizeof(a_)); + limbs_from_le_bytes(b_, b, sizeof(b_)); + + sub_mod_n(ret_, a_, b_, p, n); + zero = vec_is_zero(ret_, sizeof(ret_)); + + le_bytes_from_limbs(ret, ret_, sizeof(ret_)); + + return zero^1; +} + +#define SUB_N_CHECK_MOD_IMPL(bits) \ +inline limb_t sub_n_check_mod_##bits(pow##bits ret, const pow##bits a, \ + const pow##bits b, const vec##bits p) \ +{ return sub_n_check_mod_n(ret, a, b, p, NLIMBS(bits)); } + +SUB_N_CHECK_MOD_IMPL(256) + +static void from_mont_n(limb_t ret[], const limb_t a[], + const limb_t p[], limb_t n0, size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + llimb_t limbx; + limb_t mask, borrow, mx, hi, tmp[n]; + size_t i, j; + + for (j=0; j> LIMB_T_BITS); + for (i=1; i> LIMB_T_BITS); + } + tmp[i-1] = hi; + a = tmp; + } + + /* this is needed only if input can be non-fully-reduced */ + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = 0 - borrow; + launder(mask); + + for(i=0; i> LIMB_T_BITS); + for (i=1; i> LIMB_T_BITS); + } + tmp[i-1] = hi; + b = tmp; + } + + for (carry=0, i=0; i> LIMB_T_BITS); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + launder(mask); + + for(i=0; i> LIMB_T_BITS); + } + + for (next=ret[0], i=0; i> 1; + next = ret[i+1]; + ret[i] = limb | next << (LIMB_T_BITS-1); + } + ret[i] = next >> 1 | carry << (LIMB_T_BITS-1); + + a = ret; + } +} + +#define RSHIFT_MOD_IMPL(bits) \ +inline void rshift_mod_##bits(vec##bits ret, const vec##bits a, size_t count, \ + const vec##bits p) \ +{ 
rshift_mod_n(ret, a, count, p, NLIMBS(bits)); } + +RSHIFT_MOD_IMPL(256) +RSHIFT_MOD_IMPL(384) + +#define DIV_BY_2_MOD_IMPL(bits) \ +inline void div_by_2_mod_##bits(vec##bits ret, const vec##bits a, \ + const vec##bits p) \ +{ rshift_mod_n(ret, a, 1, p, NLIMBS(bits)); } + +DIV_BY_2_MOD_IMPL(384) + +static limb_t sgn0_pty_mod_n(const limb_t a[], const limb_t p[], size_t n) +{ + __builtin_assume(n != 0); + llimb_t limbx; + limb_t carry, borrow, ret, tmp[n]; + size_t i; + + ret = a[0] & 1; /* parity */ + + for (carry=0, i=0; i>(LIMB_T_BITS-1); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + ret |= ((carry - borrow) & 2) ^ 2; + + return ret; +} + +inline limb_t sgn0_pty_mod_384(const vec384 a, const vec384 p) +{ return sgn0_pty_mod_n(a, p, NLIMBS(384)); } + +inline limb_t sgn0_pty_mont_384(const vec384 a, const vec384 p, limb_t n0) +{ + vec384 tmp; + + from_mont_n(tmp, a, p, n0, NLIMBS(384)); + + return sgn0_pty_mod_n(tmp, p, NLIMBS(384)); +} + +inline limb_t sgn0_pty_mod_384x(const vec384x a, const vec384 p) +{ + limb_t re, im, sign, prty; + + re = sgn0_pty_mod_n(a[0], p, NLIMBS(384)); + im = sgn0_pty_mod_n(a[1], p, NLIMBS(384)); + + /* a->im!=0 ? sgn0(a->im) : sgn0(a->re) */ + sign = (limb_t)0 - vec_is_zero(a[1], sizeof(vec384)); + sign = (re & sign) | (im & ~sign); + + /* a->re==0 ? prty(a->im) : prty(a->re) */ + prty = (limb_t)0 - vec_is_zero(a[0], sizeof(vec384)); + prty = (im & prty) | (re & ~prty); + + return (sign & 2) | (prty & 1); +} + +inline limb_t sgn0_pty_mont_384x(const vec384x a, const vec384 p, limb_t n0) +{ + vec384x tmp; + + from_mont_n(tmp[0], a[0], p, n0, NLIMBS(384)); + from_mont_n(tmp[1], a[1], p, n0, NLIMBS(384)); + + return sgn0_pty_mod_384x(tmp, p); +} + +void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p, limb_t n0) +{ + vec384 aa, bb, cc; + + add_mod_n(aa, a[0], a[1], p, NLIMBS(384)); + add_mod_n(bb, b[0], b[1], p, NLIMBS(384)); + mul_mont_n(bb, bb, aa, p, n0, NLIMBS(384)); + mul_mont_n(aa, a[0], b[0], p, n0, NLIMBS(384)); + mul_mont_n(cc, a[1], b[1], p, n0, NLIMBS(384)); + sub_mod_n(ret[0], aa, cc, p, NLIMBS(384)); + sub_mod_n(ret[1], bb, aa, p, NLIMBS(384)); + sub_mod_n(ret[1], ret[1], cc, p, NLIMBS(384)); +} + +/* + * mul_mont_n without final conditional subtraction, which implies + * that modulus is one bit short, which in turn means that there are + * no carries to handle between iterations... 
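+ * sqr_n_mul_mont_383() and sqr_mont_382x() below rely on this property
+ * for their repeated squarings.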
+ */ +static void mul_mont_nonred_n(limb_t ret[], const limb_t a[], const limb_t b[], + const limb_t p[], limb_t n0, size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + llimb_t limbx; + limb_t mx, hi, tmp[n+1]; + size_t i, j; + + for (mx=b[0], hi=0, i=0; i> LIMB_T_BITS); + } + mx = n0*tmp[0]; + tmp[i] = hi; + + for (j=0; ; ) { + limbx = (mx * (llimb_t)p[0]) + tmp[0]; + hi = (limb_t)(limbx >> LIMB_T_BITS); + for (i=1; i> LIMB_T_BITS); + } + tmp[i-1] = tmp[i] + hi; + + if (++j==n) + break; + + for (mx=b[j], hi=0, i=0; i> LIMB_T_BITS); + } + mx = n0*tmp[0]; + tmp[i] = hi; + } + + vec_copy(ret, tmp, sizeof(tmp)-sizeof(limb_t)); +} + +void sqr_n_mul_mont_383(vec384 ret, const vec384 a, size_t count, + const vec384 p, limb_t n0, const vec384 b) +{ + __builtin_assume(count != 0); + while(count--) { + mul_mont_nonred_n(ret, a, a, p, n0, NLIMBS(384)); + a = ret; + } + mul_mont_n(ret, ret, b, p, n0, NLIMBS(384)); +} + +void sqr_mont_382x(vec384x ret, const vec384x a, + const vec384 p, limb_t n0) +{ + llimb_t limbx; + limb_t mask, carry, borrow; + size_t i; + vec384 t0, t1; + + /* "add_mod_n(t0, a[0], a[1], p, NLIMBS(384));" */ + for (carry=0, i=0; i> LIMB_T_BITS); + } + + /* "sub_mod_n(t1, a[0], a[1], p, NLIMBS(384));" */ + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + mask = 0 - borrow; + launder(mask); + + /* "mul_mont_n(ret[1], a[0], a[1], p, n0, NLIMBS(384));" */ + mul_mont_nonred_n(ret[1], a[0], a[1], p, n0, NLIMBS(384)); + + /* "add_mod_n(ret[1], ret[1], ret[1], p, NLIMBS(384));" */ + for (carry=0, i=0; i>(LIMB_T_BITS-1); + } + + /* "mul_mont_n(ret[0], t0, t1, p, n0, NLIMBS(384));" */ + mul_mont_nonred_n(ret[0], t0, t1, p, n0, NLIMBS(384)); + + /* account for t1's sign... */ + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + mask = 0 - borrow; + launder(mask); + for (carry=0, i=0; i> LIMB_T_BITS); + } +} + +#if defined(__GNUC__) || defined(__clang__) +# define MSB(x) ({ limb_t ret = (x) >> (LIMB_T_BITS-1); launder(ret); ret; }) +#else +# define MSB(x) ((x) >> (LIMB_T_BITS-1)) +#endif + +static size_t num_bits(limb_t l) +{ + limb_t x, mask; + size_t bits = is_zero(l) ^ 1; + + if (sizeof(limb_t) == 8) { + x = l >> (32 & (8*sizeof(limb_t)-1)); + mask = 0 - MSB(0 - x); + bits += 32 & mask; + l ^= (x ^ l) & mask; + } + + x = l >> 16; + mask = 0 - MSB(0 - x); + bits += 16 & mask; + l ^= (x ^ l) & mask; + + x = l >> 8; + mask = 0 - MSB(0 - x); + bits += 8 & mask; + l ^= (x ^ l) & mask; + + x = l >> 4; + mask = 0 - MSB(0 - x); + bits += 4 & mask; + l ^= (x ^ l) & mask; + + x = l >> 2; + mask = 0 - MSB(0 - x); + bits += 2 & mask; + l ^= (x ^ l) & mask; + + bits += l >> 1; + + return bits; +} + +#if defined(__clang_major__) && __clang_major__>7 +__attribute__((optnone)) +#endif +static limb_t lshift_2(limb_t hi, limb_t lo, size_t l) +{ + size_t r = LIMB_T_BITS - l; + limb_t mask = 0 - (is_zero(l)^1); + return (hi << (l&(LIMB_T_BITS-1))) | ((lo & mask) >> (r&(LIMB_T_BITS-1))); +} + +/* + * https://eprint.iacr.org/2020/972 with 'k' being LIMB_T_BITS-1. 
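+ * The two-limb approximation keeps the exact least significant limb and
+ * the most significant bits of |a| and |b|, which is all the constant-time
+ * inner loop consumes per outer iteration.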
+ */ +static void ab_approximation_n(limb_t a_[2], const limb_t a[], + limb_t b_[2], const limb_t b[], size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + limb_t a_hi, a_lo, b_hi, b_lo, mask; + size_t i; + + i = n-1; + a_hi = a[i], a_lo = a[i-1]; + b_hi = b[i], b_lo = b[i-1]; + for (i--; --i;) { + mask = 0 - is_zero(a_hi | b_hi); + a_hi = ((a_lo ^ a_hi) & mask) ^ a_hi; + b_hi = ((b_lo ^ b_hi) & mask) ^ b_hi; + a_lo = ((a[i] ^ a_lo) & mask) ^ a_lo; + b_lo = ((b[i] ^ b_lo) & mask) ^ b_lo; + } + i = LIMB_T_BITS - num_bits(a_hi | b_hi); + /* |i| can be LIMB_T_BITS if all a[2..]|b[2..] were zeros */ + + a_[0] = a[0], a_[1] = lshift_2(a_hi, a_lo, i); + b_[0] = b[0], b_[1] = lshift_2(b_hi, b_lo, i); +} + +typedef struct { limb_t f0, g0, f1, g1; } factors; + +static void inner_loop_n(factors *fg, const limb_t a_[2], const limb_t b_[2], + size_t n) +{ + __builtin_assume(n != 0); + llimb_t limbx; + limb_t f0 = 1, g0 = 0, f1 = 0, g1 = 1; + limb_t a_lo, a_hi, b_lo, b_hi, t_lo, t_hi, odd, borrow, xorm; + + a_lo = a_[0], a_hi = a_[1]; + b_lo = b_[0], b_hi = b_[1]; + + while(n--) { + odd = 0 - (a_lo&1); + + /* a_ -= b_ if a_ is odd */ + t_lo = a_lo, t_hi = a_hi; + limbx = a_lo - (llimb_t)(b_lo & odd); + a_lo = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + limbx = a_hi - ((llimb_t)(b_hi & odd) + borrow); + a_hi = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS); + + /* negate a_-b_ if it borrowed */ + a_lo ^= borrow; + a_hi ^= borrow; + limbx = a_lo + (llimb_t)(borrow & 1); + a_lo = (limb_t)limbx; + a_hi += (limb_t)(limbx >> LIMB_T_BITS) & 1; + + /* b_=a_ if a_-b_ borrowed */ + b_lo = ((t_lo ^ b_lo) & borrow) ^ b_lo; + b_hi = ((t_hi ^ b_hi) & borrow) ^ b_hi; + + /* exchange f0 and f1 if a_-b_ borrowed */ + xorm = (f0 ^ f1) & borrow; + f0 ^= xorm; + f1 ^= xorm; + + /* exchange g0 and g1 if a_-b_ borrowed */ + xorm = (g0 ^ g1) & borrow; + g0 ^= xorm; + g1 ^= xorm; + + /* subtract if a_ was odd */ + f0 -= f1 & odd; + g0 -= g1 & odd; + + f1 <<= 1; + g1 <<= 1; + a_lo >>= 1; a_lo |= a_hi << (LIMB_T_BITS-1); + a_hi >>= 1; + } + + fg->f0 = f0, fg->g0 = g0, fg->f1 = f1, fg->g1= g1; +} + +static limb_t cneg_n(limb_t ret[], const limb_t a[], limb_t neg, size_t n) +{ + __builtin_assume(n != 0); + llimb_t limbx = 0; + limb_t carry; + size_t i; + + for (carry=neg&1, i=0; i> LIMB_T_BITS); + } + + return 0 - MSB((limb_t)limbx); +} + +static limb_t add_n(limb_t ret[], const limb_t a[], limb_t b[], size_t n) +{ + __builtin_assume(n != 0); + llimb_t limbx; + limb_t carry; + size_t i; + + for (carry=0, i=0; i> LIMB_T_BITS); + } + + return carry; +} + +static limb_t umul_n(limb_t ret[], const limb_t a[], limb_t b, size_t n) +{ + __builtin_assume(n != 0); + llimb_t limbx; + limb_t hi; + size_t i; + + for (hi=0, i=0; i> LIMB_T_BITS); + } + + return hi; +} + +static limb_t smul_n_shift_n(limb_t ret[], const limb_t a[], limb_t *f_, + const limb_t b[], limb_t *g_, + size_t n) +{ + __builtin_assume(n != 0); + limb_t a_[n+1], b_[n+1], f, g, neg, carry, hi; + size_t i; + + /* |a|*|f_| */ + f = *f_; + neg = 0 - MSB(f); + f = (f ^ neg) - neg; /* ensure |f| is positive */ + (void)cneg_n(a_, a, neg, n); + hi = umul_n(a_, a_, f, n); + a_[n] = hi - (f & neg); + + /* |b|*|g_| */ + g = *g_; + neg = 0 - MSB(g); + g = (g ^ neg) - neg; /* ensure |g| is positive */ + (void)cneg_n(b_, b, neg, n); + hi = umul_n(b_, b_, g, n); + b_[n] = hi - (g & neg); + + /* |a|*|f_| + |b|*|g_| */ + (void)add_n(a_, a_, b_, n+1); + + /* (|a|*|f_| + |b|*|g_|) >> k */ + for (carry=a_[0], i=0; i> (LIMB_T_BITS-2); + carry = a_[i+1]; + 
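+        /* limb i of (|a|*|f_| + |b|*|g_|) >> (LIMB_T_BITS-2) */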
ret[i] = hi | (carry << 2); + } + + /* ensure result is non-negative, fix up |f_| and |g_| accordingly */ + neg = 0 - MSB(carry); + *f_ = (*f_ ^ neg) - neg; + *g_ = (*g_ ^ neg) - neg; + (void)cneg_n(ret, ret, neg, n); + + return neg; +} + +static limb_t smul_2n(limb_t ret[], const limb_t u[], limb_t f, + const limb_t v[], limb_t g, size_t n) +{ + __builtin_assume(n != 0); + limb_t u_[n], v_[n], neg, hi; + + /* |u|*|f_| */ + neg = 0 - MSB(f); + f = (f ^ neg) - neg; /* ensure |f| is positive */ + neg = cneg_n(u_, u, neg, n); + hi = umul_n(u_, u_, f, n) - (f&neg); + + /* |v|*|g_| */ + neg = 0 - MSB(g); + g = (g ^ neg) - neg; /* ensure |g| is positive */ + neg = cneg_n(v_, v, neg, n); + hi += umul_n(v_, v_, g, n) - (g&neg); + + /* |u|*|f_| + |v|*|g_| */ + hi += add_n(ret, u_, v_, n); + + return hi; +} + +static void ct_inverse_mod_n(limb_t ret[], const limb_t inp[], + const limb_t mod[], const limb_t modx[], size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + llimb_t limbx; + limb_t a[n], b[n], u[2*n], v[2*n], t[2*n]; + limb_t a_[2], b_[2], sign, carry, top; + factors fg; + size_t i; + + vec_copy(a, inp, sizeof(a)); + vec_copy(b, mod, sizeof(b)); + vec_zero(u, sizeof(u)); u[0] = 1; + vec_zero(v, sizeof(v)); + + for (i=0; i<(2*n*LIMB_T_BITS)/(LIMB_T_BITS-2); i++) { + ab_approximation_n(a_, a, b_, b, n); + inner_loop_n(&fg, a_, b_, LIMB_T_BITS-2); + (void)smul_n_shift_n(t, a, &fg.f0, b, &fg.g0, n); + (void)smul_n_shift_n(b, a, &fg.f1, b, &fg.g1, n); + vec_copy(a, t, sizeof(a)); + smul_2n(t, u, fg.f0, v, fg.g0, 2*n); + smul_2n(v, u, fg.f1, v, fg.g1, 2*n); + vec_copy(u, t, sizeof(u)); + } + + inner_loop_n(&fg, a, b, (2*n*LIMB_T_BITS)%(LIMB_T_BITS-2)); + top = smul_2n(ret, u, fg.f1, v, fg.g1, 2*n); + + sign = 0 - MSB(top); /* top is 1, 0 or -1 */ + for (carry=0, i=0; i> LIMB_T_BITS); + } + top += carry; + sign = 0 - top; /* top is 1, 0 or -1 */ + top |= sign; + for (i=0; i> LIMB_T_BITS) & 1; + limbx = a_hi - ((llimb_t)(b_hi & odd) + borrow); + a_hi = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS); + + L += ((t_lo & b_lo) >> 1) & borrow; + + /* negate a_-b_ if it borrowed */ + a_lo ^= borrow; + a_hi ^= borrow; + limbx = a_lo + (llimb_t)(borrow & 1); + a_lo = (limb_t)limbx; + a_hi += (limb_t)(limbx >> LIMB_T_BITS) & 1; + + /* b_=a_ if a_-b_ borrowed */ + b_lo = ((t_lo ^ b_lo) & borrow) ^ b_lo; + b_hi = ((t_hi ^ b_hi) & borrow) ^ b_hi; + + /* exchange f0 and f1 if a_-b_ borrowed */ + xorm = (f0 ^ f1) & borrow; + f0 ^= xorm; + f1 ^= xorm; + + /* exchange g0 and g1 if a_-b_ borrowed */ + xorm = (g0 ^ g1) & borrow; + g0 ^= xorm; + g1 ^= xorm; + + /* subtract if a_ was odd */ + f0 -= f1 & odd; + g0 -= g1 & odd; + + f1 <<= 1; + g1 <<= 1; + a_lo >>= 1; a_lo |= a_hi << (LIMB_T_BITS-1); + a_hi >>= 1; + + L += (b_lo + 2) >> 2; + } + + fg->f0 = f0, fg->g0 = g0, fg->f1 = f1, fg->g1 = g1; + + return L; +} + +static bool_t ct_is_sqr_mod_n(const limb_t inp[], const limb_t mod[], size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + limb_t a[n], b[n], t[n]; + limb_t a_[2], b_[2], neg, L = 0; + factors fg; + size_t i; + + vec_copy(a, inp, sizeof(a)); + vec_copy(b, mod, sizeof(b)); + + for (i=0; i<(2*n*LIMB_T_BITS)/(LIMB_T_BITS-2); i++) { + ab_approximation_n(a_, a, b_, b, n); + L = legendre_loop_n(L, &fg, a_, b_, LIMB_T_BITS-2); + neg = smul_n_shift_n(t, a, &fg.f0, b, &fg.g0, n); + (void)smul_n_shift_n(b, a, &fg.f1, b, &fg.g1, n); + vec_copy(a, t, sizeof(a)); + L += (b[0] >> 1) & neg; + } + + L = legendre_loop_n(L, &fg, a, b, (2*n*LIMB_T_BITS)%(LIMB_T_BITS-2)); + + return (L & 1) ^ 1; +} + +#define 
CT_IS_SQR_MOD_IMPL(bits) \ +inline bool_t ct_is_square_mod_##bits(const vec##bits inp, \ + const vec##bits mod) \ +{ return ct_is_sqr_mod_n(inp, mod, NLIMBS(bits)); } + +CT_IS_SQR_MOD_IMPL(384) + +/* + * |div_top| points at two most significant limbs of the dividend, |d_hi| + * and |d_lo| are two most significant limbs of the divisor. If divisor + * is only one limb, it is to be passed in |d_hi| with zero in |d_lo|. + * The divisor is required to be "bitwise left-aligned," and dividend's + * top limbs to be not larger than the divisor's. The latter limitation + * can be problematic in the first iteration of multi-precision division, + * where in most general case the condition would have to be "smaller." + * The subroutine considers four limbs, two of which are "overlapping," + * hence the name... Another way to look at it is to think of the pair + * of the dividend's limbs being suffixed with a zero: + * +-------+-------+-------+ + * R | | | 0 | + * +-------+-------+-------+ + * +-------+-------+ + * D | | | + * +-------+-------+ + */ +limb_t div_3_limbs(const limb_t div_top[2], limb_t d_lo, limb_t d_hi) +{ + llimb_t Rx; + limb_t r_lo = div_top[0], r_hi = div_top[1]; + limb_t Q = 0, mask, borrow, rx; + size_t i; + + for (i = 0; i < LIMB_T_BITS; i++) { + /* "borrow, Rx = R - D" */ + Rx = (llimb_t)r_lo - d_lo; + rx = (limb_t)Rx; + borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1; + Rx = r_hi - (d_hi + (llimb_t)borrow); + borrow = (limb_t)(Rx >> LIMB_T_BITS); + + /* "if (R >= D) R -= D" */ + r_lo = ((r_lo ^ rx) & borrow) ^ rx; + rx = (limb_t)Rx; + r_hi = ((r_hi ^ rx) & borrow) ^ rx; + + Q <<= 1; + Q |= ~borrow & 1; + + /* "D >>= 1" */ + d_lo >>= 1; d_lo |= d_hi << (LIMB_T_BITS - 1); + d_hi >>= 1; + } + + mask = 0 - MSB(Q); /* does it overflow? */ + + /* "borrow, Rx = R - D" */ + Rx = (llimb_t)r_lo - d_lo; + rx = (limb_t)Rx; + borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1; + Rx = r_hi - (d_hi + (llimb_t)borrow); + borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1; + + Q <<= 1; + Q |= borrow ^ 1; + + return (Q | mask); +} + +static limb_t quot_rem_n(limb_t *div_rem, const limb_t *divisor, + limb_t quotient, size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + llimb_t limbx; + limb_t tmp[n+1], carry, mask, borrow; + size_t i; + + /* divisor*quotient */ + for (carry=0, i=0; i> LIMB_T_BITS); + } + tmp[i] = carry; + + /* remainder = dividend - divisor*quotient */ + for (borrow=0, i=0; i<=n; i++) { + limbx = div_rem[i] - (tmp[i] + (llimb_t)borrow); + tmp[i] = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + } + + mask = 0 - borrow; + launder(mask); + + /* if quotient was off by one, add divisor to the remainder */ + for (carry=0, i=0; i> LIMB_T_BITS) & 1; + } + + return (div_rem[i] = quotient + mask); +} + +inline limb_t quot_rem_128(limb_t *div_rem, const limb_t *divisor, + limb_t quotient) +{ return quot_rem_n(div_rem, divisor, quotient, NLIMBS(128)); } + +inline limb_t quot_rem_64(limb_t *div_rem, const limb_t *divisor, + limb_t quotient) +{ return quot_rem_n(div_rem, divisor, quotient, NLIMBS(64)); } + +/* + * Unlock reference implementations in vect.c + */ +#define mul_by_8_mod_384 mul_by_8_mod_384 +#define mul_by_8_mod_384x mul_by_8_mod_384x +#define mul_by_3_mod_384x mul_by_3_mod_384x +#define mul_by_1_plus_i_mod_384x mul_by_1_plus_i_mod_384x +#define add_mod_384x add_mod_384x +#define sub_mod_384x sub_mod_384x +#define lshift_mod_384x lshift_mod_384x +#define sqr_mont_384x sqr_mont_384x + +inline void vec_prefetch(const void *ptr, size_t len) +{ (void)ptr; (void)len; } + +/* + * SHA-256 + */ 
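+/*
+ * Illustrative sketch only, not part of the upstream blst sources: how the
+ * portable routines below compose into a one-block SHA-256. The state is
+ * seeded with the FIPS 180-4 initial values, the empty message pads to a
+ * single 64-byte block (0x80 followed by zeros and a zero bit length), and
+ * blst_sha256_emit() serializes the big-endian digest (e3b0c442... for "").
+ * Assumes declarations of the two routines are in scope at this point.
+ */
+static void sha256_of_empty_message_example(unsigned char md[32])
+{
+    unsigned int v[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
+                          0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
+    unsigned char block[64] = { 0x80 };   /* remaining 63 bytes stay zero */
+
+    blst_sha256_block_data_order(v, block, 1);
+    blst_sha256_emit(md, v);
+}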
+#define ROTR(x,n) ((x)>>n | (x)<<(32-n)) +#define Sigma0(x) (ROTR((x),2) ^ ROTR((x),13) ^ ROTR((x),22)) +#define Sigma1(x) (ROTR((x),6) ^ ROTR((x),11) ^ ROTR((x),25)) +#define sigma0(x) (ROTR((x),7) ^ ROTR((x),18) ^ ((x)>>3)) +#define sigma1(x) (ROTR((x),17) ^ ROTR((x),19) ^ ((x)>>10)) +#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z))) +#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) + +void blst_sha256_block_data_order(unsigned int *v, const void *inp, + size_t blocks) +{ + static const unsigned int K256[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + }; + unsigned int X[16], l, a, b, c, d, e, f, g, h, s0, s1, T1, T2; + const unsigned char *data = inp; + size_t round; + + a = v[0]; + b = v[1]; + c = v[2]; + d = v[3]; + e = v[4]; + f = v[5]; + g = v[6]; + h = v[7]; + + while (blocks--) { + for (round = 0; round < 16; round++) { + l = (unsigned int)data[0] << 24; + l |= (unsigned int)data[1] << 16; + l |= (unsigned int)data[2] << 8; + l |= (unsigned int)data[3]; + data += 4; + T1 = X[round] = l; + T1 += h + Sigma1(e) + Ch(e, f, g) + K256[round]; + T2 = Sigma0(a) + Maj(a, b, c); + h = g; + g = f; + f = e; + e = d + T1; + d = c; + c = b; + b = a; + a = T1 + T2; + } + + for (; round < 64; round++) { + s0 = X[(round + 1) & 0x0f]; + s0 = sigma0(s0); + s1 = X[(round + 14) & 0x0f]; + s1 = sigma1(s1); + + T1 = X[round & 0xf] += s0 + s1 + X[(round + 9) & 0xf]; + T1 += h + Sigma1(e) + Ch(e, f, g) + K256[round]; + T2 = Sigma0(a) + Maj(a, b, c); + h = g; + g = f; + f = e; + e = d + T1; + d = c; + c = b; + b = a; + a = T1 + T2; + } + + a += v[0]; v[0] = a; + b += v[1]; v[1] = b; + c += v[2]; v[2] = c; + d += v[3]; v[3] = d; + e += v[4]; v[4] = e; + f += v[5]; v[5] = f; + g += v[6]; v[6] = g; + h += v[7]; v[7] = h; + } +} +#undef ROTR +#undef Sigma0 +#undef Sigma1 +#undef sigma0 +#undef sigma1 +#undef Ch +#undef Maj + +void blst_sha256_hcopy(unsigned int dst[8], const unsigned int src[8]) +{ + size_t i; + + for (i=0; i<8; i++) + dst[i] = src[i]; +} + +void blst_sha256_emit(unsigned char md[32], const unsigned int h[8]) +{ + size_t i; + + for (i=0; i<8; i++, md+=4) { + unsigned int h_i = h[i]; + md[0] = (unsigned char)(h_i >> 24); + md[1] = (unsigned char)(h_i >> 16); + md[2] = (unsigned char)(h_i >> 8); + md[3] = (unsigned char)h_i; + } +} + +void blst_sha256_bcopy(void *dst_, const void *src_, size_t len) +{ + unsigned char *dst = dst_; + const unsigned char *src = src_; + size_t i; + + for (i=0; iZ); /* Z1Z1 = Z1^2 */ + mul_fp2(U2, Q->X, Z1Z1); /* U2 = X2*Z1Z1 */ + + mul_fp2(S2, Q->Y, R->Z); + mul_fp2(S2, S2, Z1Z1); /* S2 = Y2*Z1*Z1Z1 */ + + sub_fp2(H, U2, R->X); /* H = U2-X1 */ + + sqr_fp2(HH, H); /* HH = H^2 */ + add_fp2(I, HH, HH); + add_fp2(I, I, I); /* I = 4*HH */ + + mul_fp2(J, H, I); /* J = H*I */ + + sub_fp2(r, S2, R->Y); + add_fp2(r, 
r, r); /* r = 2*(S2-Y1) */ + + mul_fp2(V, R->X, I); /* V = X1*I */ + + sqr_fp2(T->X, r); + sub_fp2(T->X, T->X, J); + sub_fp2(T->X, T->X, V); + sub_fp2(T->X, T->X, V); /* X3 = r^2-J-2*V */ + + mul_fp2(J, J, R->Y); + sub_fp2(T->Y, V, T->X); + mul_fp2(T->Y, T->Y, r); + sub_fp2(T->Y, T->Y, J); + sub_fp2(T->Y, T->Y, J); /* Y3 = r*(V-X3)-2*Y1*J */ + + add_fp2(T->Z, R->Z, H); + sqr_fp2(T->Z, T->Z); + sub_fp2(T->Z, T->Z, Z1Z1); + sub_fp2(T->Z, T->Z, HH); /* Z3 = (Z1+H)^2-Z1Z1-HH */ + + /* + * line evaluation + */ + mul_fp2(I, r, Q->X); + mul_fp2(J, Q->Y, T->Z); + sub_fp2(I, I, J); + add_fp2(line[0], I, I); /* 2*(r*X2 - Y2*Z3) */ +#ifdef r +# undef r +#else + vec_copy(line[1], r, sizeof(r)); +#endif + vec_copy(line[2], T->Z, sizeof(T->Z)); +} + +static void line_dbl(vec384fp6 line, POINTonE2 *T, const POINTonE2 *Q) +{ + vec384x ZZ, A, B, C, D, E, F; + + /* + * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-alnr + */ + sqr_fp2(A, Q->X); /* A = X1^2 */ + sqr_fp2(B, Q->Y); /* B = Y1^2 */ + sqr_fp2(ZZ, Q->Z); /* ZZ = Z1^2 */ + sqr_fp2(C, B); /* C = B^2 */ + + add_fp2(D, Q->X, B); /* X1+B */ + sqr_fp2(D, D); /* (X1+B)^2 */ + sub_fp2(D, D, A); /* (X1+B)^2-A */ + sub_fp2(D, D, C); /* (X1+B)^2-A-C */ + add_fp2(D, D, D); /* D = 2*((X1+B)^2-A-C) */ + + mul_by_3_fp2(E, A); /* E = 3*A */ + sqr_fp2(F, E); /* F = E^2 */ + + add_fp2(line[0], E, Q->X); /* 3*A+X1 for line evaluation */ + + sub_fp2(T->X, F, D); + sub_fp2(T->X, T->X, D); /* X3 = F-2*D */ + + add_fp2(T->Z, Q->Y, Q->Z); + sqr_fp2(T->Z, T->Z); + sub_fp2(T->Z, T->Z, B); + sub_fp2(T->Z, T->Z, ZZ); /* Z3 = (Y1+Z1)^2-B-ZZ */ + + mul_by_8_fp2(C, C); /* 8*C */ + sub_fp2(T->Y, D, T->X); /* D-X3 */ + mul_fp2(T->Y, T->Y, E); /* E*(D-X3) */ + sub_fp2(T->Y, T->Y, C); /* Y3 = E*(D-X3)-8*C */ + + /* + * line evaluation + */ + sqr_fp2(line[0], line[0]); + sub_fp2(line[0], line[0], A); + sub_fp2(line[0], line[0], F); /* (3*A+X1)^2 - X1^2 - 9*A^2 */ + lshift_fp2(B, B, 2); + sub_fp2(line[0], line[0], B); /* 6*X1^3 - 4*Y1^2 */ + + mul_fp2(line[1], E, ZZ); /* 3*X1^2 * Z1^2 */ + + mul_fp2(line[2], T->Z, ZZ); /* Z3 * Z1^2 */ +} + +static void line_by_Px2(vec384fp6 line, const POINTonE1_affine *Px2) +{ + mul_fp(line[1][0], line[1][0], Px2->X); /* "b01" *= -2*P->X */ + mul_fp(line[1][1], line[1][1], Px2->X); + + mul_fp(line[2][0], line[2][0], Px2->Y); /* "b11" *= 2*P->Y */ + mul_fp(line[2][1], line[2][1], Px2->Y); +} + +#if 0 +static void add_n_dbl(vec384fp12 ret, POINTonE2 *T, const POINTonE2_affine *Q, + const POINTonE1_affine *Px2, vec384fp6 line, size_t n) +{ + line_add(line, T, T, Q); line_by_Px2(line, Px2); + mul_by_xy00z0_fp12(ret, ret, line); + while (n--) { + sqr_fp12(ret, ret); + line_dbl(line, T, T); line_by_Px2(line, Px2); + mul_by_xy00z0_fp12(ret, ret, line); + } +} + +static void miller_loop(vec384fp12 ret, const POINTonE2 *Q, const POINTonE1 *P) +{ +#define Q ((const POINTonE2_affine *)Q) + POINTonE2 T[1]; + POINTonE1_affine Px2[1]; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + /* Move common expression from line evaluation to line_by_Px2. 
*/ + add_fp(Px2->X, P->X, P->X); + neg_fp(Px2->X, Px2->X); + add_fp(Px2->Y, P->Y, P->Y); + + vec_copy(T->X, Q->X, 2*sizeof(T->X)); + vec_copy(T->Z, BLS12_381_Rx.p2, sizeof(T->Z)); + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + line_dbl(line, T, T); /* 0x2 */ + line_by_Px2(line, Px2); + vec_zero(ret, sizeof(vec384fp12)); + vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); + vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); + add_n_dbl(ret, T, Q, Px2, line, 2); /* ..0xc */ + add_n_dbl(ret, T, Q, Px2, line, 3); /* ..0x68 */ + add_n_dbl(ret, T, Q, Px2, line, 9); /* ..0xd200 */ + add_n_dbl(ret, T, Q, Px2, line, 32); /* ..0xd20100000000 */ + add_n_dbl(ret, T, Q, Px2, line, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ +#undef Q +} +#endif + +static void start_dbl_n(vec384fp12 ret, POINTonE2 T[], + const POINTonE1_affine Px2[], size_t n) +{ + size_t i; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + line_dbl(line, T+0, T+0); line_by_Px2(line, Px2+0); + vec_zero(ret, sizeof(vec384fp12)); + vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); + vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); + + for (i = 1; i < n; i++) { + line_dbl(line, T+i, T+i); line_by_Px2(line, Px2+i); + mul_by_xy00z0_fp12(ret, ret, line); + } +} + +static void add_n_dbl_n(vec384fp12 ret, POINTonE2 T[], + const POINTonE2_affine Q[], + const POINTonE1_affine Px2[], + size_t n, size_t k) +{ + size_t i; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + for (i = 0; i < n; i++) { + line_add(line, T+i, T+i, Q+i); line_by_Px2(line, Px2+i); + mul_by_xy00z0_fp12(ret, ret, line); + } + while (k--) { + sqr_fp12(ret, ret); + for (i = 0; i < n; i++) { + line_dbl(line, T+i, T+i); line_by_Px2(line, Px2+i); + mul_by_xy00z0_fp12(ret, ret, line); + } + } +} + +static void miller_loop_n(vec384fp12 ret, const POINTonE2_affine Q[], + const POINTonE1_affine P[], size_t n) +{ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 \ + || defined(__STDC_NO_VLA__) + POINTonE2 *T = alloca(n*sizeof(POINTonE2)); + POINTonE1_affine *Px2 = alloca(n*sizeof(POINTonE1_affine)); +#else + POINTonE2 T[n]; + POINTonE1_affine Px2[n]; +#endif + size_t i; + + if ((n == 1) && (vec_is_zero(&Q[0], sizeof(Q[0])) | + vec_is_zero(&P[0], sizeof(P[0]))) ) { + /* + * Special case of infinite aggregated signature, pair the additive + * group's identity with the multiplicative group's identity. + */ + vec_copy(ret, BLS12_381_Rx.p12, sizeof(vec384fp12)); + return; + } + + for (i = 0; i < n; i++) { + /* Move common expression from line evaluation to line_by_Px2. 
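+         * Px2[i] caches -2*P[i].X and 2*P[i].Y so that each line_by_Px2()
+         * call costs only four fp multiplications per Miller-loop step.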
*/ + add_fp(Px2[i].X, P[i].X, P[i].X); + neg_fp(Px2[i].X, Px2[i].X); + add_fp(Px2[i].Y, P[i].Y, P[i].Y); + + vec_copy(T[i].X, Q[i].X, 2*sizeof(T[i].X)); + vec_copy(T[i].Z, BLS12_381_Rx.p2, sizeof(T[i].Z)); + } + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + start_dbl_n(ret, T, Px2, n); /* 0x2 */ + add_n_dbl_n(ret, T, Q, Px2, n, 2); /* ..0xc */ + add_n_dbl_n(ret, T, Q, Px2, n, 3); /* ..0x68 */ + add_n_dbl_n(ret, T, Q, Px2, n, 9); /* ..0xd200 */ + add_n_dbl_n(ret, T, Q, Px2, n, 32); /* ..0xd20100000000 */ + add_n_dbl_n(ret, T, Q, Px2, n, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ +} + +static void pre_add_n_dbl(vec384fp6 lines[], POINTonE2 *T, + const POINTonE2_affine *Q, + size_t n) +{ + line_add(lines++[0], T, T, Q); + while (n--) + line_dbl(lines++[0], T, T); +} + +static void precompute_lines(vec384fp6 Qlines[68], const POINTonE2_affine *Q) +{ + POINTonE2 T[1]; + + vec_copy(T->X, Q->X, 2*sizeof(T->X)); + vec_copy(T->Z, BLS12_381_Rx.p2, sizeof(T->Z)); + + line_dbl(Qlines[0], T, T); /* 0x2 */ + pre_add_n_dbl(&Qlines[1], T, Q, 2); /* ..0xc */ + pre_add_n_dbl(&Qlines[4], T, Q, 3); /* ..0x68 */ + pre_add_n_dbl(&Qlines[8], T, Q, 9); /* ..0xd200 */ + pre_add_n_dbl(&Qlines[18], T, Q, 32); /* ..0xd20100000000 */ + pre_add_n_dbl(&Qlines[51], T, Q, 16); /* ..0xd201000000010000 */ +} + +static void post_line_by_Px2(vec384fp6 out, const vec384fp6 in, + const POINTonE1_affine *Px2) +{ + vec_copy(out[0], in[0], sizeof(out[0])); + + mul_fp(out[1][0], in[1][0], Px2->X); /* "b01" *= -2*P->X */ + mul_fp(out[1][1], in[1][1], Px2->X); + + mul_fp(out[2][0], in[2][0], Px2->Y); /* "b11" *= 2*P->Y */ + mul_fp(out[2][1], in[2][1], Px2->Y); +} + +static void post_add_n_dbl(vec384fp12 ret, const vec384fp6 lines[], + const POINTonE1_affine *Px2, size_t n) +{ + vec384fp6 line; + + post_line_by_Px2(line, lines++[0], Px2); + mul_by_xy00z0_fp12(ret, ret, line); + while (n--) { + sqr_fp12(ret, ret); + post_line_by_Px2(line, lines++[0], Px2); + mul_by_xy00z0_fp12(ret, ret, line); + } +} + +static void miller_loop_lines(vec384fp12 ret, const vec384fp6 Qlines[68], + const POINTonE1_affine *P) +{ + POINTonE1_affine Px2[1]; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + /* Move common expression from line evaluation to line_by_Px2. 
*/ + add_fp(Px2->X, P->X, P->X); + neg_fp(Px2->X, Px2->X); + add_fp(Px2->Y, P->Y, P->Y); + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + post_line_by_Px2(line, Qlines[0], Px2); /* 0x2 */ + vec_zero(ret, sizeof(vec384fp12)); + vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); + vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); + post_add_n_dbl(ret, &Qlines[1], Px2, 2); /* ..0xc */ + post_add_n_dbl(ret, &Qlines[4], Px2, 3); /* ..0x68 */ + post_add_n_dbl(ret, &Qlines[8], Px2, 9); /* ..0xd200 */ + post_add_n_dbl(ret, &Qlines[18], Px2, 32); /* ..0xd20100000000 */ + post_add_n_dbl(ret, &Qlines[51], Px2, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ +} + +#ifdef INTERNAL_TESTMODE +static void miller_loop_alt(vec384fp12 ret, const POINTonE2_affine *Q, + const POINTonE1_affine *P) +{ + vec384fp6 lines[68]; + + precompute_lines(lines, Q); + miller_loop_lines(ret, lines, P); +} +#endif + +static void mul_n_sqr(vec384fp12 ret, const vec384fp12 a, size_t n) +{ + mul_fp12(ret, ret, a); + while (n--) + cyclotomic_sqr_fp12(ret, ret); +} + +static void raise_to_z_div_by_2(vec384fp12 ret, const vec384fp12 a) +{ + cyclotomic_sqr_fp12(ret, a); /* 0x2 */ + mul_n_sqr(ret, a, 2); /* ..0xc */ + mul_n_sqr(ret, a, 3); /* ..0x68 */ + mul_n_sqr(ret, a, 9); /* ..0xd200 */ + mul_n_sqr(ret, a, 32); /* ..0xd20100000000 */ + mul_n_sqr(ret, a, 16-1); /* ..0x6900800000008000 */ + conjugate_fp12(ret); /* account for z being negative */ +} + +#define raise_to_z(a, b) (raise_to_z_div_by_2(a, b), cyclotomic_sqr_fp12(a, a)) + +/* + * Adaptation from /pairing/src/bls12_381/mod.rs + */ +static void final_exp(vec384fp12 ret, const vec384fp12 f) +{ + vec384fp12 y0, y1, y2, y3; + + vec_copy(y1, f, sizeof(y1)); + conjugate_fp12(y1); + inverse_fp12(y2, f); + mul_fp12(ret, y1, y2); + frobenius_map_fp12(y2, ret, 2); + mul_fp12(ret, ret, y2); + + cyclotomic_sqr_fp12(y0, ret); + raise_to_z(y1, y0); + raise_to_z_div_by_2(y2, y1); + vec_copy(y3, ret, sizeof(y3)); + conjugate_fp12(y3); + mul_fp12(y1, y1, y3); + conjugate_fp12(y1); + mul_fp12(y1, y1, y2); + raise_to_z(y2, y1); + raise_to_z(y3, y2); + conjugate_fp12(y1); + mul_fp12(y3, y3, y1); + conjugate_fp12(y1); + frobenius_map_fp12(y1, y1, 3); + frobenius_map_fp12(y2, y2, 2); + mul_fp12(y1, y1, y2); + raise_to_z(y2, y3); + mul_fp12(y2, y2, y0); + mul_fp12(y2, y2, ret); + mul_fp12(y1, y1, y2); + frobenius_map_fp12(y2, y3, 1); + mul_fp12(ret, y1, y2); +} + +void blst_miller_loop(vec384fp12 ret, const POINTonE2_affine *Q, + const POINTonE1_affine *P) +{ miller_loop_n(ret, Q ? Q : (const POINTonE2_affine *)&BLS12_381_G2, + P ? 
P : (const POINTonE1_affine *)&BLS12_381_G1, 1); +} + +void blst_final_exp(vec384fp12 ret, const vec384fp12 f) +{ final_exp(ret, f); } + +void blst_precompute_lines(vec384fp6 Qlines[68], const POINTonE2_affine *Q) +{ precompute_lines(Qlines, Q); } + +void blst_miller_loop_lines(vec384fp12 ret, const vec384fp6 Qlines[68], + const POINTonE1_affine *P) +{ miller_loop_lines(ret, Qlines, P); } + +static bool_t is_cyclotomic(const vec384fp12 f) +{ + vec384fp12 a, b; + + frobenius_map_fp12(a, f, 2); + frobenius_map_fp12(b, a, 2); + mul_fp12(b, b, f); + + return vec_is_equal(a, b, sizeof(a)); +} + +int blst_fp12_in_group(const vec384fp12 f) +{ + vec384fp12 a, b; + + if (vec_is_zero(f, sizeof(vec384fp12)) || !is_cyclotomic(f)) + return 0; + + frobenius_map_fp12(a, f, 1); + raise_to_z(b, f); + + return (int)vec_is_equal(a, b, sizeof(a)); +} diff --git a/crypto/blst_src/pentaroot-addchain.h b/crypto/blst_src/pentaroot-addchain.h new file mode 100644 index 00000000000..5bdd9ddf7f7 --- /dev/null +++ b/crypto/blst_src/pentaroot-addchain.h @@ -0,0 +1,333 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * The "magic" number is 1/5 modulo BLS12_381_r-1. Exponentiation to which + * yields 5th root of the base. + * + * Generated with 'addchain 20974350070050476191779096203274386335076221000211055129041463479975432473805' + * https://github.com/kwantam/addchain + * # Bos-Coster (win=4) : 307 (15) + * # Bos-Coster (win=10) : 307 (18) + * # Yacobi : 319 (16) + * # Bos-Coster (win=2) : 319 ( 5) + * # Bos-Coster (win=5) : 306 (19) <<< + * # Bos-Coster (win=7) : 311 (22) + * # Bos-Coster (win=9) : 313 (20) + * # Bos-Coster (win=3) : 314 ( 9) + * # Bos-Coster (win=6) : 309 (21) + * # Bos-Coster (win=8) : 309 (23) + * # Bergeron-Berstel-Brlek-Duboc : 334 ( 5) + */ + +#define PENTAROOT_MOD_BLS12_381_r(out, inp, ptype) do { \ +ptype t[19]; \ +vec_copy(t[1], inp, sizeof(ptype)); /* 0: 1 */\ +sqr(t[7], t[1]); /* 1: 2 */\ +sqr(t[0], t[7]); /* 2: 4 */\ +sqr(t[2], t[0]); /* 3: 8 */\ +mul(t[10], t[2], t[1]); /* 4: 9 */\ +mul(t[3], t[10], t[7]); /* 5: b */\ +mul(t[1], t[10], t[0]); /* 6: d */\ +mul(t[5], t[3], t[0]); /* 7: f */\ +mul(t[9], t[10], t[2]); /* 8: 11 */\ +mul(t[4], t[3], t[2]); /* 9: 13 */\ +mul(t[15], t[5], t[2]); /* 10: 17 */\ +mul(t[8], t[15], t[2]); /* 11: 1f */\ +mul(t[13], t[8], t[7]); /* 12: 21 */\ +mul(t[14], t[8], t[0]); /* 13: 23 */\ +mul(t[12], t[13], t[0]); /* 14: 25 */\ +mul(t[6], t[8], t[2]); /* 15: 27 */\ +mul(t[11], t[14], t[2]); /* 16: 2b */\ +sqr(t[0], t[15]); /* 17: 2e */\ +mul(t[18], t[6], t[2]); /* 18: 2f */\ +mul(t[2], t[11], t[2]); /* 19: 33 */\ +mul(t[16], t[2], t[7]); /* 20: 35 */\ +mul(t[7], t[0], t[3]); /* 21: 39 */\ +mul(t[17], t[0], t[5]); /* 22: 3d */\ +/* sqr(t[0], t[0]); */ /* 23: 5c */\ +/* sqr(t[0], t[0]); */ /* 24: b8 */\ +/* sqr(t[0], t[0]); */ /* 25: 170 */\ +/* sqr(t[0], t[0]); */ /* 26: 2e0 */\ +/* sqr(t[0], t[0]); */ /* 27: 5c0 */\ +/* sqr(t[0], t[0]); */ /* 28: b80 */\ +/* sqr(t[0], t[0]); */ /* 29: 1700 */\ +sqr_n_mul(t[0], t[0], 7, t[18]); /* 30: 172f */\ +/* sqr(t[0], t[0]); */ /* 31: 2e5e */\ +/* sqr(t[0], t[0]); */ /* 32: 5cbc */\ +/* sqr(t[0], t[0]); */ /* 33: b978 */\ +/* sqr(t[0], t[0]); */ /* 34: 172f0 */\ +/* sqr(t[0], t[0]); */ /* 35: 2e5e0 */\ +/* sqr(t[0], t[0]); */ /* 36: 5cbc0 */\ +sqr_n_mul(t[0], t[0], 6, t[13]); /* 37: 5cbe1 */\ +/* sqr(t[0], t[0]); */ /* 38: b97c2 */\ +/* sqr(t[0], t[0]); */ /* 39: 172f84 */\ +/* sqr(t[0], t[0]); */ /* 40: 
2e5f08 */\ +/* sqr(t[0], t[0]); */ /* 41: 5cbe10 */\ +/* sqr(t[0], t[0]); */ /* 42: b97c20 */\ +/* sqr(t[0], t[0]); */ /* 43: 172f840 */\ +sqr_n_mul(t[0], t[0], 6, t[17]); /* 44: 172f87d */\ +/* sqr(t[0], t[0]); */ /* 45: 2e5f0fa */\ +/* sqr(t[0], t[0]); */ /* 46: 5cbe1f4 */\ +/* sqr(t[0], t[0]); */ /* 47: b97c3e8 */\ +/* sqr(t[0], t[0]); */ /* 48: 172f87d0 */\ +/* sqr(t[0], t[0]); */ /* 49: 2e5f0fa0 */\ +/* sqr(t[0], t[0]); */ /* 50: 5cbe1f40 */\ +sqr_n_mul(t[0], t[0], 6, t[16]); /* 51: 5cbe1f75 */\ +/* sqr(t[0], t[0]); */ /* 52: b97c3eea */\ +/* sqr(t[0], t[0]); */ /* 53: 172f87dd4 */\ +/* sqr(t[0], t[0]); */ /* 54: 2e5f0fba8 */\ +/* sqr(t[0], t[0]); */ /* 55: 5cbe1f750 */\ +/* sqr(t[0], t[0]); */ /* 56: b97c3eea0 */\ +sqr_n_mul(t[0], t[0], 5, t[15]); /* 57: b97c3eeb7 */\ +/* sqr(t[0], t[0]); */ /* 58: 172f87dd6e */\ +/* sqr(t[0], t[0]); */ /* 59: 2e5f0fbadc */\ +/* sqr(t[0], t[0]); */ /* 60: 5cbe1f75b8 */\ +/* sqr(t[0], t[0]); */ /* 61: b97c3eeb70 */\ +/* sqr(t[0], t[0]); */ /* 62: 172f87dd6e0 */\ +/* sqr(t[0], t[0]); */ /* 63: 2e5f0fbadc0 */\ +sqr_n_mul(t[0], t[0], 6, t[15]); /* 64: 2e5f0fbadd7 */\ +/* sqr(t[0], t[0]); */ /* 65: 5cbe1f75bae */\ +/* sqr(t[0], t[0]); */ /* 66: b97c3eeb75c */\ +/* sqr(t[0], t[0]); */ /* 67: 172f87dd6eb8 */\ +/* sqr(t[0], t[0]); */ /* 68: 2e5f0fbadd70 */\ +/* sqr(t[0], t[0]); */ /* 69: 5cbe1f75bae0 */\ +/* sqr(t[0], t[0]); */ /* 70: b97c3eeb75c0 */\ +/* sqr(t[0], t[0]); */ /* 71: 172f87dd6eb80 */\ +/* sqr(t[0], t[0]); */ /* 72: 2e5f0fbadd700 */\ +sqr_n_mul(t[0], t[0], 8, t[14]); /* 73: 2e5f0fbadd723 */\ +/* sqr(t[0], t[0]); */ /* 74: 5cbe1f75bae46 */\ +/* sqr(t[0], t[0]); */ /* 75: b97c3eeb75c8c */\ +/* sqr(t[0], t[0]); */ /* 76: 172f87dd6eb918 */\ +/* sqr(t[0], t[0]); */ /* 77: 2e5f0fbadd7230 */\ +/* sqr(t[0], t[0]); */ /* 78: 5cbe1f75bae460 */\ +/* sqr(t[0], t[0]); */ /* 79: b97c3eeb75c8c0 */\ +/* sqr(t[0], t[0]); */ /* 80: 172f87dd6eb9180 */\ +/* sqr(t[0], t[0]); */ /* 81: 2e5f0fbadd72300 */\ +sqr_n_mul(t[0], t[0], 8, t[13]); /* 82: 2e5f0fbadd72321 */\ +/* sqr(t[0], t[0]); */ /* 83: 5cbe1f75bae4642 */\ +/* sqr(t[0], t[0]); */ /* 84: b97c3eeb75c8c84 */\ +/* sqr(t[0], t[0]); */ /* 85: 172f87dd6eb91908 */\ +/* sqr(t[0], t[0]); */ /* 86: 2e5f0fbadd723210 */\ +/* sqr(t[0], t[0]); */ /* 87: 5cbe1f75bae46420 */\ +/* sqr(t[0], t[0]); */ /* 88: b97c3eeb75c8c840 */\ +sqr_n_mul(t[0], t[0], 6, t[2]); /* 89: b97c3eeb75c8c873 */\ +/* sqr(t[0], t[0]); */ /* 90: 172f87dd6eb9190e6 */\ +/* sqr(t[0], t[0]); */ /* 91: 2e5f0fbadd72321cc */\ +/* sqr(t[0], t[0]); */ /* 92: 5cbe1f75bae464398 */\ +/* sqr(t[0], t[0]); */ /* 93: b97c3eeb75c8c8730 */\ +/* sqr(t[0], t[0]); */ /* 94: 172f87dd6eb9190e60 */\ +/* sqr(t[0], t[0]); */ /* 95: 2e5f0fbadd72321cc0 */\ +sqr_n_mul(t[0], t[0], 6, t[13]); /* 96: 2e5f0fbadd72321ce1 */\ +/* sqr(t[0], t[0]); */ /* 97: 5cbe1f75bae46439c2 */\ +/* sqr(t[0], t[0]); */ /* 98: b97c3eeb75c8c87384 */\ +/* sqr(t[0], t[0]); */ /* 99: 172f87dd6eb9190e708 */\ +/* sqr(t[0], t[0]); */ /* 100: 2e5f0fbadd72321ce10 */\ +/* sqr(t[0], t[0]); */ /* 101: 5cbe1f75bae46439c20 */\ +/* sqr(t[0], t[0]); */ /* 102: b97c3eeb75c8c873840 */\ +/* sqr(t[0], t[0]); */ /* 103: 172f87dd6eb9190e7080 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 104: 172f87dd6eb9190e70a5 */\ +/* sqr(t[0], t[0]); */ /* 105: 2e5f0fbadd72321ce14a */\ +/* sqr(t[0], t[0]); */ /* 106: 5cbe1f75bae46439c294 */\ +/* sqr(t[0], t[0]); */ /* 107: b97c3eeb75c8c8738528 */\ +/* sqr(t[0], t[0]); */ /* 108: 172f87dd6eb9190e70a50 */\ +/* sqr(t[0], t[0]); */ /* 109: 2e5f0fbadd72321ce14a0 */\ +/* sqr(t[0], t[0]); */ /* 110: 
5cbe1f75bae46439c2940 */\ +/* sqr(t[0], t[0]); */ /* 111: b97c3eeb75c8c87385280 */\ +/* sqr(t[0], t[0]); */ /* 112: 172f87dd6eb9190e70a500 */\ +sqr_n_mul(t[0], t[0], 8, t[11]); /* 113: 172f87dd6eb9190e70a52b */\ +/* sqr(t[0], t[0]); */ /* 114: 2e5f0fbadd72321ce14a56 */\ +/* sqr(t[0], t[0]); */ /* 115: 5cbe1f75bae46439c294ac */\ +/* sqr(t[0], t[0]); */ /* 116: b97c3eeb75c8c873852958 */\ +/* sqr(t[0], t[0]); */ /* 117: 172f87dd6eb9190e70a52b0 */\ +/* sqr(t[0], t[0]); */ /* 118: 2e5f0fbadd72321ce14a560 */\ +/* sqr(t[0], t[0]); */ /* 119: 5cbe1f75bae46439c294ac0 */\ +sqr_n_mul(t[0], t[0], 6, t[1]); /* 120: 5cbe1f75bae46439c294acd */\ +/* sqr(t[0], t[0]); */ /* 121: b97c3eeb75c8c873852959a */\ +/* sqr(t[0], t[0]); */ /* 122: 172f87dd6eb9190e70a52b34 */\ +/* sqr(t[0], t[0]); */ /* 123: 2e5f0fbadd72321ce14a5668 */\ +/* sqr(t[0], t[0]); */ /* 124: 5cbe1f75bae46439c294acd0 */\ +/* sqr(t[0], t[0]); */ /* 125: b97c3eeb75c8c873852959a0 */\ +/* sqr(t[0], t[0]); */ /* 126: 172f87dd6eb9190e70a52b340 */\ +/* sqr(t[0], t[0]); */ /* 127: 2e5f0fbadd72321ce14a56680 */\ +/* sqr(t[0], t[0]); */ /* 128: 5cbe1f75bae46439c294acd00 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 129: 5cbe1f75bae46439c294acd33 */\ +/* sqr(t[0], t[0]); */ /* 130: b97c3eeb75c8c873852959a66 */\ +/* sqr(t[0], t[0]); */ /* 131: 172f87dd6eb9190e70a52b34cc */\ +/* sqr(t[0], t[0]); */ /* 132: 2e5f0fbadd72321ce14a566998 */\ +/* sqr(t[0], t[0]); */ /* 133: 5cbe1f75bae46439c294acd330 */\ +/* sqr(t[0], t[0]); */ /* 134: b97c3eeb75c8c873852959a660 */\ +/* sqr(t[0], t[0]); */ /* 135: 172f87dd6eb9190e70a52b34cc0 */\ +sqr_n_mul(t[0], t[0], 6, t[11]); /* 136: 172f87dd6eb9190e70a52b34ceb */\ +/* sqr(t[0], t[0]); */ /* 137: 2e5f0fbadd72321ce14a56699d6 */\ +/* sqr(t[0], t[0]); */ /* 138: 5cbe1f75bae46439c294acd33ac */\ +/* sqr(t[0], t[0]); */ /* 139: b97c3eeb75c8c873852959a6758 */\ +/* sqr(t[0], t[0]); */ /* 140: 172f87dd6eb9190e70a52b34ceb0 */\ +sqr_n_mul(t[0], t[0], 4, t[10]); /* 141: 172f87dd6eb9190e70a52b34ceb9 */\ +/* sqr(t[0], t[0]); */ /* 142: 2e5f0fbadd72321ce14a56699d72 */\ +/* sqr(t[0], t[0]); */ /* 143: 5cbe1f75bae46439c294acd33ae4 */\ +/* sqr(t[0], t[0]); */ /* 144: b97c3eeb75c8c873852959a675c8 */\ +/* sqr(t[0], t[0]); */ /* 145: 172f87dd6eb9190e70a52b34ceb90 */\ +/* sqr(t[0], t[0]); */ /* 146: 2e5f0fbadd72321ce14a56699d720 */\ +sqr_n_mul(t[0], t[0], 5, t[8]); /* 147: 2e5f0fbadd72321ce14a56699d73f */\ +/* sqr(t[0], t[0]); */ /* 148: 5cbe1f75bae46439c294acd33ae7e */\ +/* sqr(t[0], t[0]); */ /* 149: b97c3eeb75c8c873852959a675cfc */\ +/* sqr(t[0], t[0]); */ /* 150: 172f87dd6eb9190e70a52b34ceb9f8 */\ +/* sqr(t[0], t[0]); */ /* 151: 2e5f0fbadd72321ce14a56699d73f0 */\ +/* sqr(t[0], t[0]); */ /* 152: 5cbe1f75bae46439c294acd33ae7e0 */\ +/* sqr(t[0], t[0]); */ /* 153: b97c3eeb75c8c873852959a675cfc0 */\ +/* sqr(t[0], t[0]); */ /* 154: 172f87dd6eb9190e70a52b34ceb9f80 */\ +/* sqr(t[0], t[0]); */ /* 155: 2e5f0fbadd72321ce14a56699d73f00 */\ +/* sqr(t[0], t[0]); */ /* 156: 5cbe1f75bae46439c294acd33ae7e00 */\ +/* sqr(t[0], t[0]); */ /* 157: b97c3eeb75c8c873852959a675cfc00 */\ +/* sqr(t[0], t[0]); */ /* 158: 172f87dd6eb9190e70a52b34ceb9f800 */\ +/* sqr(t[0], t[0]); */ /* 159: 2e5f0fbadd72321ce14a56699d73f000 */\ +/* sqr(t[0], t[0]); */ /* 160: 5cbe1f75bae46439c294acd33ae7e000 */\ +/* sqr(t[0], t[0]); */ /* 161: b97c3eeb75c8c873852959a675cfc000 */\ +/* sqr(t[0], t[0]); */ /* 162: 172f87dd6eb9190e70a52b34ceb9f8000 */\ +sqr_n_mul(t[0], t[0], 15, t[9]); /* 163: 172f87dd6eb9190e70a52b34ceb9f8011 */\ +/* sqr(t[0], t[0]); */ /* 164: 2e5f0fbadd72321ce14a56699d73f0022 */\ 
+/* sqr(t[0], t[0]); */ /* 165: 5cbe1f75bae46439c294acd33ae7e0044 */\ +/* sqr(t[0], t[0]); */ /* 166: b97c3eeb75c8c873852959a675cfc0088 */\ +/* sqr(t[0], t[0]); */ /* 167: 172f87dd6eb9190e70a52b34ceb9f80110 */\ +/* sqr(t[0], t[0]); */ /* 168: 2e5f0fbadd72321ce14a56699d73f00220 */\ +/* sqr(t[0], t[0]); */ /* 169: 5cbe1f75bae46439c294acd33ae7e00440 */\ +/* sqr(t[0], t[0]); */ /* 170: b97c3eeb75c8c873852959a675cfc00880 */\ +/* sqr(t[0], t[0]); */ /* 171: 172f87dd6eb9190e70a52b34ceb9f801100 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 172: 172f87dd6eb9190e70a52b34ceb9f80110b */\ +/* sqr(t[0], t[0]); */ /* 173: 2e5f0fbadd72321ce14a56699d73f002216 */\ +/* sqr(t[0], t[0]); */ /* 174: 5cbe1f75bae46439c294acd33ae7e00442c */\ +/* sqr(t[0], t[0]); */ /* 175: b97c3eeb75c8c873852959a675cfc008858 */\ +/* sqr(t[0], t[0]); */ /* 176: 172f87dd6eb9190e70a52b34ceb9f80110b0 */\ +/* sqr(t[0], t[0]); */ /* 177: 2e5f0fbadd72321ce14a56699d73f0022160 */\ +sqr_n_mul(t[0], t[0], 5, t[8]); /* 178: 2e5f0fbadd72321ce14a56699d73f002217f */\ +/* sqr(t[0], t[0]); */ /* 179: 5cbe1f75bae46439c294acd33ae7e00442fe */\ +/* sqr(t[0], t[0]); */ /* 180: b97c3eeb75c8c873852959a675cfc00885fc */\ +/* sqr(t[0], t[0]); */ /* 181: 172f87dd6eb9190e70a52b34ceb9f80110bf8 */\ +/* sqr(t[0], t[0]); */ /* 182: 2e5f0fbadd72321ce14a56699d73f002217f0 */\ +/* sqr(t[0], t[0]); */ /* 183: 5cbe1f75bae46439c294acd33ae7e00442fe0 */\ +/* sqr(t[0], t[0]); */ /* 184: b97c3eeb75c8c873852959a675cfc00885fc0 */\ +/* sqr(t[0], t[0]); */ /* 185: 172f87dd6eb9190e70a52b34ceb9f80110bf80 */\ +/* sqr(t[0], t[0]); */ /* 186: 2e5f0fbadd72321ce14a56699d73f002217f00 */\ +/* sqr(t[0], t[0]); */ /* 187: 5cbe1f75bae46439c294acd33ae7e00442fe00 */\ +/* sqr(t[0], t[0]); */ /* 188: b97c3eeb75c8c873852959a675cfc00885fc00 */\ +sqr_n_mul(t[0], t[0], 10, t[7]); /* 189: b97c3eeb75c8c873852959a675cfc00885fc39 */\ +/* sqr(t[0], t[0]); */ /* 190: 172f87dd6eb9190e70a52b34ceb9f80110bf872 */\ +/* sqr(t[0], t[0]); */ /* 191: 2e5f0fbadd72321ce14a56699d73f002217f0e4 */\ +/* sqr(t[0], t[0]); */ /* 192: 5cbe1f75bae46439c294acd33ae7e00442fe1c8 */\ +/* sqr(t[0], t[0]); */ /* 193: b97c3eeb75c8c873852959a675cfc00885fc390 */\ +/* sqr(t[0], t[0]); */ /* 194: 172f87dd6eb9190e70a52b34ceb9f80110bf8720 */\ +/* sqr(t[0], t[0]); */ /* 195: 2e5f0fbadd72321ce14a56699d73f002217f0e40 */\ +sqr_n_mul(t[0], t[0], 6, t[6]); /* 196: 2e5f0fbadd72321ce14a56699d73f002217f0e67 */\ +/* sqr(t[0], t[0]); */ /* 197: 5cbe1f75bae46439c294acd33ae7e00442fe1cce */\ +/* sqr(t[0], t[0]); */ /* 198: b97c3eeb75c8c873852959a675cfc00885fc399c */\ +/* sqr(t[0], t[0]); */ /* 199: 172f87dd6eb9190e70a52b34ceb9f80110bf87338 */\ +/* sqr(t[0], t[0]); */ /* 200: 2e5f0fbadd72321ce14a56699d73f002217f0e670 */\ +/* sqr(t[0], t[0]); */ /* 201: 5cbe1f75bae46439c294acd33ae7e00442fe1cce0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 202: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3 */\ +/* sqr(t[0], t[0]); */ /* 203: b97c3eeb75c8c873852959a675cfc00885fc399e6 */\ +/* sqr(t[0], t[0]); */ /* 204: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cc */\ +/* sqr(t[0], t[0]); */ /* 205: 2e5f0fbadd72321ce14a56699d73f002217f0e6798 */\ +/* sqr(t[0], t[0]); */ /* 206: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf30 */\ +/* sqr(t[0], t[0]); */ /* 207: b97c3eeb75c8c873852959a675cfc00885fc399e60 */\ +/* sqr(t[0], t[0]); */ /* 208: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cc0 */\ +/* sqr(t[0], t[0]); */ /* 209: 2e5f0fbadd72321ce14a56699d73f002217f0e67980 */\ +/* sqr(t[0], t[0]); */ /* 210: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf300 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 211: 
5cbe1f75bae46439c294acd33ae7e00442fe1ccf333 */\ +/* sqr(t[0], t[0]); */ /* 212: b97c3eeb75c8c873852959a675cfc00885fc399e666 */\ +/* sqr(t[0], t[0]); */ /* 213: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc */\ +/* sqr(t[0], t[0]); */ /* 214: 2e5f0fbadd72321ce14a56699d73f002217f0e679998 */\ +/* sqr(t[0], t[0]); */ /* 215: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3330 */\ +/* sqr(t[0], t[0]); */ /* 216: b97c3eeb75c8c873852959a675cfc00885fc399e6660 */\ +/* sqr(t[0], t[0]); */ /* 217: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc0 */\ +/* sqr(t[0], t[0]); */ /* 218: 2e5f0fbadd72321ce14a56699d73f002217f0e6799980 */\ +sqr_n_mul(t[0], t[0], 7, t[5]); /* 219: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f */\ +/* sqr(t[0], t[0]); */ /* 220: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e */\ +/* sqr(t[0], t[0]); */ /* 221: b97c3eeb75c8c873852959a675cfc00885fc399e6663c */\ +/* sqr(t[0], t[0]); */ /* 222: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78 */\ +/* sqr(t[0], t[0]); */ /* 223: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f0 */\ +/* sqr(t[0], t[0]); */ /* 224: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e0 */\ +/* sqr(t[0], t[0]); */ /* 225: b97c3eeb75c8c873852959a675cfc00885fc399e6663c0 */\ +/* sqr(t[0], t[0]); */ /* 226: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc780 */\ +/* sqr(t[0], t[0]); */ /* 227: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f00 */\ +/* sqr(t[0], t[0]); */ /* 228: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e00 */\ +sqr_n_mul(t[0], t[0], 9, t[2]); /* 229: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33 */\ +/* sqr(t[0], t[0]); */ /* 230: b97c3eeb75c8c873852959a675cfc00885fc399e6663c66 */\ +/* sqr(t[0], t[0]); */ /* 231: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc */\ +/* sqr(t[0], t[0]); */ /* 232: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f198 */\ +/* sqr(t[0], t[0]); */ /* 233: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e330 */\ +/* sqr(t[0], t[0]); */ /* 234: b97c3eeb75c8c873852959a675cfc00885fc399e6663c660 */\ +/* sqr(t[0], t[0]); */ /* 235: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc0 */\ +/* sqr(t[0], t[0]); */ /* 236: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1980 */\ +sqr_n_mul(t[0], t[0], 7, t[4]); /* 237: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1993 */\ +/* sqr(t[0], t[0]); */ /* 238: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326 */\ +/* sqr(t[0], t[0]); */ /* 239: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664c */\ +/* sqr(t[0], t[0]); */ /* 240: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc98 */\ +/* sqr(t[0], t[0]); */ /* 241: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19930 */\ +/* sqr(t[0], t[0]); */ /* 242: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33260 */\ +/* sqr(t[0], t[0]); */ /* 243: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664c0 */\ +/* sqr(t[0], t[0]); */ /* 244: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc980 */\ +/* sqr(t[0], t[0]); */ /* 245: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f199300 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 246: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f199333 */\ +/* sqr(t[0], t[0]); */ /* 247: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666 */\ +/* sqr(t[0], t[0]); */ /* 248: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccc */\ +/* sqr(t[0], t[0]); */ /* 249: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9998 */\ +/* sqr(t[0], t[0]); */ /* 250: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1993330 */\ +/* sqr(t[0], t[0]); */ /* 251: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326660 */\ +/* sqr(t[0], t[0]); */ /* 252: 
b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccc0 */\ +/* sqr(t[0], t[0]); */ /* 253: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc99980 */\ +/* sqr(t[0], t[0]); */ /* 254: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933300 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 255: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333 */\ +/* sqr(t[0], t[0]); */ /* 256: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33266666 */\ +/* sqr(t[0], t[0]); */ /* 257: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccccc */\ +/* sqr(t[0], t[0]); */ /* 258: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc999998 */\ +/* sqr(t[0], t[0]); */ /* 259: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f199333330 */\ +/* sqr(t[0], t[0]); */ /* 260: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666660 */\ +/* sqr(t[0], t[0]); */ /* 261: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccccc0 */\ +/* sqr(t[0], t[0]); */ /* 262: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9999980 */\ +/* sqr(t[0], t[0]); */ /* 263: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1993333300 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 264: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1993333333 */\ +/* sqr(t[0], t[0]); */ /* 265: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326666666 */\ +/* sqr(t[0], t[0]); */ /* 266: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccccccc */\ +/* sqr(t[0], t[0]); */ /* 267: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc99999998 */\ +/* sqr(t[0], t[0]); */ /* 268: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333330 */\ +/* sqr(t[0], t[0]); */ /* 269: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33266666660 */\ +/* sqr(t[0], t[0]); */ /* 270: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccccccc0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 271: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb */\ +/* sqr(t[0], t[0]); */ /* 272: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc999999996 */\ +/* sqr(t[0], t[0]); */ /* 273: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332c */\ +/* sqr(t[0], t[0]); */ /* 274: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666666658 */\ +/* sqr(t[0], t[0]); */ /* 275: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb0 */\ +/* sqr(t[0], t[0]); */ /* 276: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9999999960 */\ +/* sqr(t[0], t[0]); */ /* 277: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332c0 */\ +/* sqr(t[0], t[0]); */ /* 278: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326666666580 */\ +/* sqr(t[0], t[0]); */ /* 279: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb00 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 280: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb33 */\ +/* sqr(t[0], t[0]); */ /* 281: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc99999999666 */\ +/* sqr(t[0], t[0]); */ /* 282: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccc */\ +/* sqr(t[0], t[0]); */ /* 283: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33266666665998 */\ +/* sqr(t[0], t[0]); */ /* 284: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb330 */\ +/* sqr(t[0], t[0]); */ /* 285: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc999999996660 */\ +/* sqr(t[0], t[0]); */ /* 286: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccc0 */\ +/* sqr(t[0], t[0]); */ /* 287: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666666659980 */\ +/* sqr(t[0], t[0]); */ /* 288: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb3300 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 289: 
b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb3333 */\ +/* sqr(t[0], t[0]); */ /* 290: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9999999966666 */\ +/* sqr(t[0], t[0]); */ /* 291: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccccc */\ +/* sqr(t[0], t[0]); */ /* 292: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326666666599998 */\ +/* sqr(t[0], t[0]); */ /* 293: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb33330 */\ +/* sqr(t[0], t[0]); */ /* 294: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc99999999666660 */\ +/* sqr(t[0], t[0]); */ /* 295: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccccc0 */\ +/* sqr(t[0], t[0]); */ /* 296: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33266666665999980 */\ +/* sqr(t[0], t[0]); */ /* 297: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb333300 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 298: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb333333 */\ +/* sqr(t[0], t[0]); */ /* 299: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc999999996666666 */\ +/* sqr(t[0], t[0]); */ /* 300: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccccccc */\ +/* sqr(t[0], t[0]); */ /* 301: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666666659999998 */\ +/* sqr(t[0], t[0]); */ /* 302: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb3333330 */\ +/* sqr(t[0], t[0]); */ /* 303: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9999999966666660 */\ +/* sqr(t[0], t[0]); */ /* 304: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccccccc0 */\ +sqr_n_mul(out, t[0], 6, t[1]); /* 305: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332cccccccd */\ +} while(0) diff --git a/crypto/blst_src/pentaroot.c b/crypto/blst_src/pentaroot.c new file mode 100644 index 00000000000..fd028113f3d --- /dev/null +++ b/crypto/blst_src/pentaroot.c @@ -0,0 +1,76 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" + +static inline void mul_fr(vec384 ret, const vec384 a, const vec384 b) +{ mul_mont_sparse_256(ret, a, b, BLS12_381_r, r0); } + +static inline void sqr_fr(vec384 ret, const vec384 a) +{ sqr_mont_sparse_256(ret, a, BLS12_381_r, r0); } + +#ifdef __OPTIMIZE_SIZE__ +void blst_fr_pentaroot(vec256 out, const vec256 inp) +{ + static const byte pow[] = { + TO_BYTES(0x33333332cccccccd), TO_BYTES(0x217f0e679998f199), + TO_BYTES(0xe14a56699d73f002), TO_BYTES(0x2e5f0fbadd72321c) + }; + size_t pow_bits = 254; + vec256 ret; + + vec_copy(ret, inp, sizeof(ret)); /* ret = inp^1 */ + --pow_bits; /* most significant bit is set, skip over */ + while (pow_bits--) { + sqr_fr(ret, ret); + if (is_bit_set(pow, pow_bits)) + mul_fr(ret, ret, inp); + } + vec_copy(out, ret, sizeof(ret)); /* out = ret */ +} +#else +# if 0 +/* + * "255"-bit variant omits full reductions at the ends of squarings, + * not implemented yet[?]. 
+ */ +static inline void sqr_n_mul_fr(vec256 out, const vec256 a, size_t count, + const vec256 b) +{ sqr_n_mul_mont_255(out, a, count, BLS12_381_r, r0, b); } +# else +static void sqr_n_mul_fr(vec256 out, const vec256 a, size_t count, + const vec256 b) +{ + do { + sqr_fr(out, a); + a = out; + } while (--count); + mul_fr(out, out, b); +} +# endif + +# define sqr(ret,a) sqr_fr(ret,a) +# define mul(ret,a,b) mul_fr(ret,a,b) +# define sqr_n_mul(ret,a,n,b) sqr_n_mul_fr(ret,a,n,b) + +# include "pentaroot-addchain.h" +void blst_fr_pentaroot(vec256 out, const vec256 inp) +{ PENTAROOT_MOD_BLS12_381_r(out, inp, vec256); } +# undef PENTAROOT_MOD_BLS12_381_r + +# undef sqr_n_mul +# undef sqr +# undef mul +#endif + +void blst_fr_pentapow(vec256 out, const vec256 inp) +{ + vec256 tmp; + + sqr_fr(tmp, inp); + sqr_fr(tmp, tmp); + mul_fr(out, tmp, inp); +} diff --git a/crypto/blst_src/point.h b/crypto/blst_src/point.h new file mode 100644 index 00000000000..0aa7379671f --- /dev/null +++ b/crypto/blst_src/point.h @@ -0,0 +1,62 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_POINT_H__ +#define __BLS12_381_ASM_POINT_H__ + +#include "vect.h" +#include "bytes.h" + +#define DECLARE_POINT(ptype, bits) \ +typedef struct { vec##bits X,Y,Z; } ptype; \ +typedef struct { vec##bits X,Y; } ptype##_affine; \ +\ +static void ptype##_dadd(ptype *out, const ptype *p1, const ptype *p2, \ + const vec##bits a4); \ +static void ptype##_dadd_affine(ptype *out, const ptype *p1, \ + const ptype##_affine *p2); \ +static void ptype##_add(ptype *out, const ptype *p1, const ptype *p2); \ +static void ptype##_add_affine(ptype *out, const ptype *p1, \ + const ptype##_affine *p2); \ +static void ptype##_double(ptype *out, const ptype *p1); \ +static void ptype##_mult_w5(ptype *out, const ptype *point, \ + const byte *scalar, size_t nbits); \ +static void ptype##_cneg(ptype *p, limb_t cbit); \ +static void ptype##_to_affine(ptype##_affine *out, const ptype *in); \ +static void ptype##_from_Jacobian(ptype *out, const ptype *in); \ +\ +static inline void ptype##_cswap(ptype *restrict a, \ + ptype *restrict b, bool_t cbit) { \ + vec_cswap(a, b, sizeof(ptype), cbit); \ +} \ +static inline void ptype##_ccopy(ptype *restrict a, \ + const ptype *restrict b, bool_t cbit) {\ + vec_select(a, b, a, sizeof(ptype), cbit); \ +} + +#define DECLARE_PRIVATE_POINTXZ(ptype, bits) \ +typedef struct { vec##bits X,Z; } ptype##xz; \ +\ +static void ptype##xz_ladder_pre(ptype##xz *out, const ptype *in); \ +static void ptype##xz_ladder_step(ptype##xz *r, ptype##xz *s, \ + const ptype##xz *p); \ +static void ptype##xz_ladder_post(ptype *ret, \ + const ptype##xz *r, const ptype##xz *s, \ + const ptype##xz *p, const vec##bits Y1);\ +\ +static inline void ptype##xz_cswap(ptype##xz *restrict a, \ + ptype##xz *restrict b, bool_t cbit) {\ + vec_cswap(a, b, sizeof(ptype##xz), cbit); \ +} + +DECLARE_POINT(POINTonE1, 384) + +DECLARE_POINT(POINTonE2, 384x) + +#ifdef __GNUC__ +# pragma GCC diagnostic ignored "-Wunused-function" +#endif + +#endif diff --git a/crypto/blst_src/rb_tree.c b/crypto/blst_src/rb_tree.c new file mode 100644 index 00000000000..207becdad18 --- /dev/null +++ b/crypto/blst_src/rb_tree.c @@ -0,0 +1,145 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include <stddef.h> + +/* + * Red-black tree tailored for uniqueness test. Amount of messages to be + * checked is known prior context initialization, implementation is + * insert-only, failure is returned if message is already in the tree. + */ + +struct node { + struct node *leafs[2]; + const void *data; + size_t len_n_colour; /* len<<1 | colour */ +}; + +struct rb_tree { + struct node *root; + size_t n_nodes; + struct node nodes[1]; +}; + +static long bytes_compare(const unsigned char *ptr0, size_t len0, + const unsigned char *ptr1, size_t len1) +{ + size_t i, len = len0<len1 ? len0 : len1; + long a, b; + + for (i = 0; i < len; i++) { + if ((a = ptr0[i]) != (b = ptr1[i])) + return a - b; + } + + return (long)len0 - (long)len1; +} + +#define PAINT_BLACK(p) ((p)->len_n_colour &= ~(size_t)1) +#define PAINT_RED(p) ((p)->len_n_colour |= 1) +#define IS_RED(p) ((p)->len_n_colour & 1) + +static int rb_tree_insert(struct rb_tree *tree, const void *data, size_t len) +{ + struct node *nodes[8*sizeof(void *)]; /* visited nodes */ + unsigned char dirs[8*sizeof(void *)]; /* taken directions */ + size_t k = 0; /* walked distance */ + struct node *p, *y, *z; + + for (p = tree->root; p != NULL; k++) { + long cmp = bytes_compare(data, len, p->data, p->len_n_colour>>1); + + if (cmp == 0) + return 0; /* already in tree, no insertion */ + + /* record the step */ + nodes[k] = p; + p = p->leafs[(dirs[k] = cmp>0)]; + } + + /* allocate new node */ + z = &tree->nodes[tree->n_nodes++]; + z->leafs[0] = z->leafs[1] = NULL; + z->data = data; + z->len_n_colour = len<<1; + PAINT_RED(z); + + /* graft |z| */ + if (k > 0) + nodes[k-1]->leafs[dirs[k-1]] = z; + else + tree->root = z; + + /* re-balance |tree| */ + while (k >= 2 && IS_RED(y = nodes[k-1])) { + size_t ydir = dirs[k-2]; + struct node *x = nodes[k-2], /* |z|'s grandparent */ + *s = x->leafs[ydir^1]; /* |z|'s uncle */ + + if (s != NULL && IS_RED(s)) { + PAINT_RED(x); + PAINT_BLACK(y); + PAINT_BLACK(s); + k -= 2; + } else { + if (dirs[k-1] != ydir) { + /* | | + * x x + * / \ \ + * y s -> z s + * \ / + * z y + * / \ + * ? ? + */ + struct node *t = y; + y = y->leafs[ydir^1]; + t->leafs[ydir^1] = y->leafs[ydir]; + y->leafs[ydir] = t; + } + + /* | | + * x y + * \ / \ + * y s -> z x + * / \ / \ + * z ? ? s + */ + x->leafs[ydir] = y->leafs[ydir^1]; + y->leafs[ydir^1] = x; + + PAINT_RED(x); + PAINT_BLACK(y); + + if (k > 2) + nodes[k-3]->leafs[dirs[k-3]] = y; + else + tree->root = y; + + break; + } + } + + PAINT_BLACK(tree->root); + + return 1; +} + +#undef IS_RED +#undef PAINT_RED +#undef PAINT_BLACK + +size_t blst_uniq_sizeof(size_t n_nodes) +{ return sizeof(struct rb_tree) + sizeof(struct node)*(n_nodes-1); } + +void blst_uniq_init(struct rb_tree *tree) +{ + tree->root = NULL; + tree->n_nodes = 0; +} + +int blst_uniq_test(struct rb_tree *tree, const void *data, size_t len) +{ return (int)rb_tree_insert(tree, data, len); } diff --git a/crypto/blst_src/recip-addchain.h b/crypto/blst_src/recip-addchain.h new file mode 100644 index 00000000000..e4e436a3f09 --- /dev/null +++ b/crypto/blst_src/recip-addchain.h @@ -0,0 +1,489 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * The "magic" number is BLS12_381_P-2. Exponentiation to which yields + * reciprocal to input base.
+ * + * Generated with 'addchain 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559785' + * https://github.com/kwantam/addchain + * + * # Bos-Coster (win=4) : 461 (16) <<< + * # Bos-Coster (win=3) : 464 ( 9) + * # Bos-Coster (win=8) : 469 (35) + * # Bos-Coster (win=5) : 463 (28) + * # Bos-Coster (win=9) : 467 (32) + * # Bos-Coster (win=7) : 462 (27) + * # Yacobi : 481 (31) + * # Bos-Coster (win=10) : 475 (30) + * # Bos-Coster (win=6) : 463 (32) + * # Bos-Coster (win=2) : 489 ( 5) + * # Bergeron-Berstel-Brlek-Duboc : 498 ( 5) + */ + +#define RECIPROCAL_MOD_BLS12_381_P(out, inp, ptype) do { \ +ptype t[16]; \ +vec_copy(t[1], inp, sizeof(ptype)); /* 0: 1 */\ +sqr(t[0], t[1]); /* 1: 2 */\ +mul(t[9], t[0], t[1]); /* 2: 3 */\ +sqr(t[5], t[0]); /* 3: 4 */\ +mul(t[2], t[9], t[0]); /* 4: 5 */\ +mul(t[7], t[5], t[9]); /* 5: 7 */\ +mul(t[10], t[2], t[5]); /* 6: 9 */\ +mul(t[13], t[7], t[5]); /* 7: b */\ +mul(t[4], t[10], t[5]); /* 8: d */\ +mul(t[8], t[13], t[5]); /* 9: f */\ +mul(t[15], t[4], t[5]); /* 10: 11 */\ +mul(t[11], t[8], t[5]); /* 11: 13 */\ +mul(t[3], t[15], t[5]); /* 12: 15 */\ +mul(t[12], t[11], t[5]); /* 13: 17 */\ +sqr(t[0], t[4]); /* 14: 1a */\ +mul(t[14], t[12], t[5]); /* 15: 1b */\ +mul(t[6], t[0], t[9]); /* 16: 1d */\ +mul(t[5], t[0], t[2]); /* 17: 1f */\ +/* sqr(t[0], t[0]); */ /* 18: 34 */\ +/* sqr(t[0], t[0]); */ /* 19: 68 */\ +/* sqr(t[0], t[0]); */ /* 20: d0 */\ +/* sqr(t[0], t[0]); */ /* 21: 1a0 */\ +/* sqr(t[0], t[0]); */ /* 22: 340 */\ +/* sqr(t[0], t[0]); */ /* 23: 680 */\ +/* sqr(t[0], t[0]); */ /* 24: d00 */\ +/* sqr(t[0], t[0]); */ /* 25: 1a00 */\ +/* sqr(t[0], t[0]); */ /* 26: 3400 */\ +/* sqr(t[0], t[0]); */ /* 27: 6800 */\ +/* sqr(t[0], t[0]); */ /* 28: d000 */\ +/* sqr(t[0], t[0]); */ /* 29: 1a000 */\ +sqr_n_mul(t[0], t[0], 12, t[15]); /* 30: 1a011 */\ +/* sqr(t[0], t[0]); */ /* 31: 34022 */\ +/* sqr(t[0], t[0]); */ /* 32: 68044 */\ +/* sqr(t[0], t[0]); */ /* 33: d0088 */\ +/* sqr(t[0], t[0]); */ /* 34: 1a0110 */\ +/* sqr(t[0], t[0]); */ /* 35: 340220 */\ +/* sqr(t[0], t[0]); */ /* 36: 680440 */\ +/* sqr(t[0], t[0]); */ /* 37: d00880 */\ +sqr_n_mul(t[0], t[0], 7, t[8]); /* 38: d0088f */\ +/* sqr(t[0], t[0]); */ /* 39: 1a0111e */\ +/* sqr(t[0], t[0]); */ /* 40: 340223c */\ +/* sqr(t[0], t[0]); */ /* 41: 6804478 */\ +/* sqr(t[0], t[0]); */ /* 42: d0088f0 */\ +sqr_n_mul(t[0], t[0], 4, t[2]); /* 43: d0088f5 */\ +/* sqr(t[0], t[0]); */ /* 44: 1a0111ea */\ +/* sqr(t[0], t[0]); */ /* 45: 340223d4 */\ +/* sqr(t[0], t[0]); */ /* 46: 680447a8 */\ +/* sqr(t[0], t[0]); */ /* 47: d0088f50 */\ +/* sqr(t[0], t[0]); */ /* 48: 1a0111ea0 */\ +/* sqr(t[0], t[0]); */ /* 49: 340223d40 */\ +sqr_n_mul(t[0], t[0], 6, t[7]); /* 50: 340223d47 */\ +/* sqr(t[0], t[0]); */ /* 51: 680447a8e */\ +/* sqr(t[0], t[0]); */ /* 52: d0088f51c */\ +/* sqr(t[0], t[0]); */ /* 53: 1a0111ea38 */\ +/* sqr(t[0], t[0]); */ /* 54: 340223d470 */\ +/* sqr(t[0], t[0]); */ /* 55: 680447a8e0 */\ +/* sqr(t[0], t[0]); */ /* 56: d0088f51c0 */\ +/* sqr(t[0], t[0]); */ /* 57: 1a0111ea380 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 58: 1a0111ea397 */\ +/* sqr(t[0], t[0]); */ /* 59: 340223d472e */\ +/* sqr(t[0], t[0]); */ /* 60: 680447a8e5c */\ +/* sqr(t[0], t[0]); */ /* 61: d0088f51cb8 */\ +/* sqr(t[0], t[0]); */ /* 62: 1a0111ea3970 */\ +/* sqr(t[0], t[0]); */ /* 63: 340223d472e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 64: 340223d472ff */\ +/* sqr(t[0], t[0]); */ /* 65: 680447a8e5fe */\ +/* sqr(t[0], t[0]); */ /* 66: d0088f51cbfc */\ +sqr_n_mul(t[0], t[0], 2, 
t[9]); /* 67: d0088f51cbff */\ +/* sqr(t[0], t[0]); */ /* 68: 1a0111ea397fe */\ +/* sqr(t[0], t[0]); */ /* 69: 340223d472ffc */\ +/* sqr(t[0], t[0]); */ /* 70: 680447a8e5ff8 */\ +/* sqr(t[0], t[0]); */ /* 71: d0088f51cbff0 */\ +/* sqr(t[0], t[0]); */ /* 72: 1a0111ea397fe0 */\ +/* sqr(t[0], t[0]); */ /* 73: 340223d472ffc0 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 74: 340223d472ffcd */\ +/* sqr(t[0], t[0]); */ /* 75: 680447a8e5ff9a */\ +/* sqr(t[0], t[0]); */ /* 76: d0088f51cbff34 */\ +/* sqr(t[0], t[0]); */ /* 77: 1a0111ea397fe68 */\ +/* sqr(t[0], t[0]); */ /* 78: 340223d472ffcd0 */\ +/* sqr(t[0], t[0]); */ /* 79: 680447a8e5ff9a0 */\ +/* sqr(t[0], t[0]); */ /* 80: d0088f51cbff340 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 81: d0088f51cbff34d */\ +/* sqr(t[0], t[0]); */ /* 82: 1a0111ea397fe69a */\ +/* sqr(t[0], t[0]); */ /* 83: 340223d472ffcd34 */\ +/* sqr(t[0], t[0]); */ /* 84: 680447a8e5ff9a68 */\ +/* sqr(t[0], t[0]); */ /* 85: d0088f51cbff34d0 */\ +/* sqr(t[0], t[0]); */ /* 86: 1a0111ea397fe69a0 */\ +/* sqr(t[0], t[0]); */ /* 87: 340223d472ffcd340 */\ +sqr_n_mul(t[0], t[0], 6, t[10]); /* 88: 340223d472ffcd349 */\ +/* sqr(t[0], t[0]); */ /* 89: 680447a8e5ff9a692 */\ +/* sqr(t[0], t[0]); */ /* 90: d0088f51cbff34d24 */\ +/* sqr(t[0], t[0]); */ /* 91: 1a0111ea397fe69a48 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 92: 1a0111ea397fe69a4b */\ +/* sqr(t[0], t[0]); */ /* 93: 340223d472ffcd3496 */\ +/* sqr(t[0], t[0]); */ /* 94: 680447a8e5ff9a692c */\ +/* sqr(t[0], t[0]); */ /* 95: d0088f51cbff34d258 */\ +/* sqr(t[0], t[0]); */ /* 96: 1a0111ea397fe69a4b0 */\ +/* sqr(t[0], t[0]); */ /* 97: 340223d472ffcd34960 */\ +/* sqr(t[0], t[0]); */ /* 98: 680447a8e5ff9a692c0 */\ +/* sqr(t[0], t[0]); */ /* 99: d0088f51cbff34d2580 */\ +sqr_n_mul(t[0], t[0], 7, t[4]); /* 100: d0088f51cbff34d258d */\ +/* sqr(t[0], t[0]); */ /* 101: 1a0111ea397fe69a4b1a */\ +/* sqr(t[0], t[0]); */ /* 102: 340223d472ffcd349634 */\ +/* sqr(t[0], t[0]); */ /* 103: 680447a8e5ff9a692c68 */\ +/* sqr(t[0], t[0]); */ /* 104: d0088f51cbff34d258d0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 105: d0088f51cbff34d258dd */\ +/* sqr(t[0], t[0]); */ /* 106: 1a0111ea397fe69a4b1ba */\ +/* sqr(t[0], t[0]); */ /* 107: 340223d472ffcd3496374 */\ +/* sqr(t[0], t[0]); */ /* 108: 680447a8e5ff9a692c6e8 */\ +/* sqr(t[0], t[0]); */ /* 109: d0088f51cbff34d258dd0 */\ +/* sqr(t[0], t[0]); */ /* 110: 1a0111ea397fe69a4b1ba0 */\ +/* sqr(t[0], t[0]); */ /* 111: 340223d472ffcd34963740 */\ +sqr_n_mul(t[0], t[0], 6, t[8]); /* 112: 340223d472ffcd3496374f */\ +/* sqr(t[0], t[0]); */ /* 113: 680447a8e5ff9a692c6e9e */\ +/* sqr(t[0], t[0]); */ /* 114: d0088f51cbff34d258dd3c */\ +/* sqr(t[0], t[0]); */ /* 115: 1a0111ea397fe69a4b1ba78 */\ +/* sqr(t[0], t[0]); */ /* 116: 340223d472ffcd3496374f0 */\ +/* sqr(t[0], t[0]); */ /* 117: 680447a8e5ff9a692c6e9e0 */\ +/* sqr(t[0], t[0]); */ /* 118: d0088f51cbff34d258dd3c0 */\ +sqr_n_mul(t[0], t[0], 6, t[14]); /* 119: d0088f51cbff34d258dd3db */\ +/* sqr(t[0], t[0]); */ /* 120: 1a0111ea397fe69a4b1ba7b6 */\ +/* sqr(t[0], t[0]); */ /* 121: 340223d472ffcd3496374f6c */\ +/* sqr(t[0], t[0]); */ /* 122: 680447a8e5ff9a692c6e9ed8 */\ +sqr_n_mul(t[0], t[0], 3, t[1]); /* 123: 680447a8e5ff9a692c6e9ed9 */\ +/* sqr(t[0], t[0]); */ /* 124: d0088f51cbff34d258dd3db2 */\ +/* sqr(t[0], t[0]); */ /* 125: 1a0111ea397fe69a4b1ba7b64 */\ +/* sqr(t[0], t[0]); */ /* 126: 340223d472ffcd3496374f6c8 */\ +/* sqr(t[0], t[0]); */ /* 127: 680447a8e5ff9a692c6e9ed90 */\ +/* sqr(t[0], t[0]); */ /* 128: d0088f51cbff34d258dd3db20 */\ +/* sqr(t[0], t[0]); */ /* 129: 
1a0111ea397fe69a4b1ba7b640 */\ +/* sqr(t[0], t[0]); */ /* 130: 340223d472ffcd3496374f6c80 */\ +/* sqr(t[0], t[0]); */ /* 131: 680447a8e5ff9a692c6e9ed900 */\ +sqr_n_mul(t[0], t[0], 8, t[4]); /* 132: 680447a8e5ff9a692c6e9ed90d */\ +/* sqr(t[0], t[0]); */ /* 133: d0088f51cbff34d258dd3db21a */\ +/* sqr(t[0], t[0]); */ /* 134: 1a0111ea397fe69a4b1ba7b6434 */\ +/* sqr(t[0], t[0]); */ /* 135: 340223d472ffcd3496374f6c868 */\ +/* sqr(t[0], t[0]); */ /* 136: 680447a8e5ff9a692c6e9ed90d0 */\ +/* sqr(t[0], t[0]); */ /* 137: d0088f51cbff34d258dd3db21a0 */\ +/* sqr(t[0], t[0]); */ /* 138: 1a0111ea397fe69a4b1ba7b64340 */\ +/* sqr(t[0], t[0]); */ /* 139: 340223d472ffcd3496374f6c8680 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 140: 340223d472ffcd3496374f6c8697 */\ +/* sqr(t[0], t[0]); */ /* 141: 680447a8e5ff9a692c6e9ed90d2e */\ +/* sqr(t[0], t[0]); */ /* 142: d0088f51cbff34d258dd3db21a5c */\ +/* sqr(t[0], t[0]); */ /* 143: 1a0111ea397fe69a4b1ba7b6434b8 */\ +/* sqr(t[0], t[0]); */ /* 144: 340223d472ffcd3496374f6c86970 */\ +/* sqr(t[0], t[0]); */ /* 145: 680447a8e5ff9a692c6e9ed90d2e0 */\ +sqr_n_mul(t[0], t[0], 5, t[13]); /* 146: 680447a8e5ff9a692c6e9ed90d2eb */\ +/* sqr(t[0], t[0]); */ /* 147: d0088f51cbff34d258dd3db21a5d6 */\ +/* sqr(t[0], t[0]); */ /* 148: 1a0111ea397fe69a4b1ba7b6434bac */\ +/* sqr(t[0], t[0]); */ /* 149: 340223d472ffcd3496374f6c869758 */\ +/* sqr(t[0], t[0]); */ /* 150: 680447a8e5ff9a692c6e9ed90d2eb0 */\ +/* sqr(t[0], t[0]); */ /* 151: d0088f51cbff34d258dd3db21a5d60 */\ +/* sqr(t[0], t[0]); */ /* 152: 1a0111ea397fe69a4b1ba7b6434bac0 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 153: 1a0111ea397fe69a4b1ba7b6434bacd */\ +/* sqr(t[0], t[0]); */ /* 154: 340223d472ffcd3496374f6c869759a */\ +/* sqr(t[0], t[0]); */ /* 155: 680447a8e5ff9a692c6e9ed90d2eb34 */\ +/* sqr(t[0], t[0]); */ /* 156: d0088f51cbff34d258dd3db21a5d668 */\ +/* sqr(t[0], t[0]); */ /* 157: 1a0111ea397fe69a4b1ba7b6434bacd0 */\ +/* sqr(t[0], t[0]); */ /* 158: 340223d472ffcd3496374f6c869759a0 */\ +/* sqr(t[0], t[0]); */ /* 159: 680447a8e5ff9a692c6e9ed90d2eb340 */\ +sqr_n_mul(t[0], t[0], 6, t[6]); /* 160: 680447a8e5ff9a692c6e9ed90d2eb35d */\ +/* sqr(t[0], t[0]); */ /* 161: d0088f51cbff34d258dd3db21a5d66ba */\ +/* sqr(t[0], t[0]); */ /* 162: 1a0111ea397fe69a4b1ba7b6434bacd74 */\ +/* sqr(t[0], t[0]); */ /* 163: 340223d472ffcd3496374f6c869759ae8 */\ +/* sqr(t[0], t[0]); */ /* 164: 680447a8e5ff9a692c6e9ed90d2eb35d0 */\ +sqr_n_mul(t[0], t[0], 4, t[10]); /* 165: 680447a8e5ff9a692c6e9ed90d2eb35d9 */\ +/* sqr(t[0], t[0]); */ /* 166: d0088f51cbff34d258dd3db21a5d66bb2 */\ +/* sqr(t[0], t[0]); */ /* 167: 1a0111ea397fe69a4b1ba7b6434bacd764 */\ +/* sqr(t[0], t[0]); */ /* 168: 340223d472ffcd3496374f6c869759aec8 */\ +/* sqr(t[0], t[0]); */ /* 169: 680447a8e5ff9a692c6e9ed90d2eb35d90 */\ +/* sqr(t[0], t[0]); */ /* 170: d0088f51cbff34d258dd3db21a5d66bb20 */\ +/* sqr(t[0], t[0]); */ /* 171: 1a0111ea397fe69a4b1ba7b6434bacd7640 */\ +/* sqr(t[0], t[0]); */ /* 172: 340223d472ffcd3496374f6c869759aec80 */\ +/* sqr(t[0], t[0]); */ /* 173: 680447a8e5ff9a692c6e9ed90d2eb35d900 */\ +sqr_n_mul(t[0], t[0], 8, t[6]); /* 174: 680447a8e5ff9a692c6e9ed90d2eb35d91d */\ +/* sqr(t[0], t[0]); */ /* 175: d0088f51cbff34d258dd3db21a5d66bb23a */\ +/* sqr(t[0], t[0]); */ /* 176: 1a0111ea397fe69a4b1ba7b6434bacd76474 */\ +/* sqr(t[0], t[0]); */ /* 177: 340223d472ffcd3496374f6c869759aec8e8 */\ +/* sqr(t[0], t[0]); */ /* 178: 680447a8e5ff9a692c6e9ed90d2eb35d91d0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 179: 680447a8e5ff9a692c6e9ed90d2eb35d91dd */\ +/* sqr(t[0], t[0]); */ /* 180: 
d0088f51cbff34d258dd3db21a5d66bb23ba */\ +/* sqr(t[0], t[0]); */ /* 181: 1a0111ea397fe69a4b1ba7b6434bacd764774 */\ +/* sqr(t[0], t[0]); */ /* 182: 340223d472ffcd3496374f6c869759aec8ee8 */\ +/* sqr(t[0], t[0]); */ /* 183: 680447a8e5ff9a692c6e9ed90d2eb35d91dd0 */\ +/* sqr(t[0], t[0]); */ /* 184: d0088f51cbff34d258dd3db21a5d66bb23ba0 */\ +/* sqr(t[0], t[0]); */ /* 185: 1a0111ea397fe69a4b1ba7b6434bacd7647740 */\ +/* sqr(t[0], t[0]); */ /* 186: 340223d472ffcd3496374f6c869759aec8ee80 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 187: 340223d472ffcd3496374f6c869759aec8ee97 */\ +/* sqr(t[0], t[0]); */ /* 188: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e */\ +/* sqr(t[0], t[0]); */ /* 189: d0088f51cbff34d258dd3db21a5d66bb23ba5c */\ +/* sqr(t[0], t[0]); */ /* 190: 1a0111ea397fe69a4b1ba7b6434bacd764774b8 */\ +/* sqr(t[0], t[0]); */ /* 191: 340223d472ffcd3496374f6c869759aec8ee970 */\ +/* sqr(t[0], t[0]); */ /* 192: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e0 */\ +/* sqr(t[0], t[0]); */ /* 193: d0088f51cbff34d258dd3db21a5d66bb23ba5c0 */\ +/* sqr(t[0], t[0]); */ /* 194: 1a0111ea397fe69a4b1ba7b6434bacd764774b80 */\ +/* sqr(t[0], t[0]); */ /* 195: 340223d472ffcd3496374f6c869759aec8ee9700 */\ +/* sqr(t[0], t[0]); */ /* 196: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e00 */\ +sqr_n_mul(t[0], t[0], 9, t[11]); /* 197: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13 */\ +/* sqr(t[0], t[0]); */ /* 198: d0088f51cbff34d258dd3db21a5d66bb23ba5c26 */\ +/* sqr(t[0], t[0]); */ /* 199: 1a0111ea397fe69a4b1ba7b6434bacd764774b84c */\ +sqr_n_mul(t[0], t[0], 2, t[9]); /* 200: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f */\ +/* sqr(t[0], t[0]); */ /* 201: 340223d472ffcd3496374f6c869759aec8ee9709e */\ +/* sqr(t[0], t[0]); */ /* 202: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13c */\ +/* sqr(t[0], t[0]); */ /* 203: d0088f51cbff34d258dd3db21a5d66bb23ba5c278 */\ +/* sqr(t[0], t[0]); */ /* 204: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f0 */\ +/* sqr(t[0], t[0]); */ /* 205: 340223d472ffcd3496374f6c869759aec8ee9709e0 */\ +sqr_n_mul(t[0], t[0], 5, t[7]); /* 206: 340223d472ffcd3496374f6c869759aec8ee9709e7 */\ +/* sqr(t[0], t[0]); */ /* 207: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce */\ +/* sqr(t[0], t[0]); */ /* 208: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c */\ +/* sqr(t[0], t[0]); */ /* 209: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38 */\ +/* sqr(t[0], t[0]); */ /* 210: 340223d472ffcd3496374f6c869759aec8ee9709e70 */\ +/* sqr(t[0], t[0]); */ /* 211: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce0 */\ +/* sqr(t[0], t[0]); */ /* 212: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c0 */\ +/* sqr(t[0], t[0]); */ /* 213: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f380 */\ +sqr_n_mul(t[0], t[0], 7, t[2]); /* 214: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385 */\ +/* sqr(t[0], t[0]); */ /* 215: 340223d472ffcd3496374f6c869759aec8ee9709e70a */\ +/* sqr(t[0], t[0]); */ /* 216: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce14 */\ +/* sqr(t[0], t[0]); */ /* 217: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c28 */\ +/* sqr(t[0], t[0]); */ /* 218: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f3850 */\ +/* sqr(t[0], t[0]); */ /* 219: 340223d472ffcd3496374f6c869759aec8ee9709e70a0 */\ +/* sqr(t[0], t[0]); */ /* 220: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce140 */\ +/* sqr(t[0], t[0]); */ /* 221: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c280 */\ +sqr_n_mul(t[0], t[0], 7, t[10]); /* 222: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c289 */\ +/* sqr(t[0], t[0]); */ /* 223: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512 */\ +/* sqr(t[0], t[0]); */ /* 224: 340223d472ffcd3496374f6c869759aec8ee9709e70a24 */\ +/* 
sqr(t[0], t[0]); */ /* 225: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce1448 */\ +/* sqr(t[0], t[0]); */ /* 226: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2890 */\ +/* sqr(t[0], t[0]); */ /* 227: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385120 */\ +/* sqr(t[0], t[0]); */ /* 228: 340223d472ffcd3496374f6c869759aec8ee9709e70a240 */\ +sqr_n_mul(t[0], t[0], 6, t[12]); /* 229: 340223d472ffcd3496374f6c869759aec8ee9709e70a257 */\ +/* sqr(t[0], t[0]); */ /* 230: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae */\ +/* sqr(t[0], t[0]); */ /* 231: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895c */\ +/* sqr(t[0], t[0]); */ /* 232: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512b8 */\ +/* sqr(t[0], t[0]); */ /* 233: 340223d472ffcd3496374f6c869759aec8ee9709e70a2570 */\ +/* sqr(t[0], t[0]); */ /* 234: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae0 */\ +sqr_n_mul(t[0], t[0], 5, t[6]); /* 235: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd */\ +/* sqr(t[0], t[0]); */ /* 236: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa */\ +/* sqr(t[0], t[0]); */ /* 237: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf4 */\ +/* sqr(t[0], t[0]); */ /* 238: 340223d472ffcd3496374f6c869759aec8ee9709e70a257e8 */\ +/* sqr(t[0], t[0]); */ /* 239: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd0 */\ +/* sqr(t[0], t[0]); */ /* 240: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[11]); /* 241: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3 */\ +/* sqr(t[0], t[0]); */ /* 242: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf66 */\ +/* sqr(t[0], t[0]); */ /* 243: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ecc */\ +/* sqr(t[0], t[0]); */ /* 244: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd98 */\ +/* sqr(t[0], t[0]); */ /* 245: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb30 */\ +/* sqr(t[0], t[0]); */ /* 246: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf660 */\ +sqr_n_mul(t[0], t[0], 5, t[11]); /* 247: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf673 */\ +/* sqr(t[0], t[0]); */ /* 248: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece6 */\ +/* sqr(t[0], t[0]); */ /* 249: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc */\ +/* sqr(t[0], t[0]); */ /* 250: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398 */\ +/* sqr(t[0], t[0]); */ /* 251: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730 */\ +/* sqr(t[0], t[0]); */ /* 252: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece60 */\ +/* sqr(t[0], t[0]); */ /* 253: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc0 */\ +/* sqr(t[0], t[0]); */ /* 254: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3980 */\ +/* sqr(t[0], t[0]); */ /* 255: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf67300 */\ +sqr_n_mul(t[0], t[0], 8, t[4]); /* 256: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d */\ +/* sqr(t[0], t[0]); */ /* 257: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a */\ +/* sqr(t[0], t[0]); */ /* 258: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34 */\ +/* sqr(t[0], t[0]); */ /* 259: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39868 */\ +/* sqr(t[0], t[0]); */ /* 260: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d0 */\ +/* sqr(t[0], t[0]); */ /* 261: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a0 */\ +/* sqr(t[0], t[0]); */ /* 262: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc340 */\ +/* sqr(t[0], t[0]); */ /* 263: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398680 */\ +sqr_n_mul(t[0], t[0], 7, t[3]); /* 264: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695 */\ +/* 
sqr(t[0], t[0]); */ /* 265: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a */\ +/* sqr(t[0], t[0]); */ /* 266: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a54 */\ +/* sqr(t[0], t[0]); */ /* 267: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a8 */\ +/* sqr(t[0], t[0]); */ /* 268: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3986950 */\ +/* sqr(t[0], t[0]); */ /* 269: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0 */\ +/* sqr(t[0], t[0]); */ /* 270: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a540 */\ +/* sqr(t[0], t[0]); */ /* 271: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a80 */\ +/* sqr(t[0], t[0]); */ /* 272: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869500 */\ +/* sqr(t[0], t[0]); */ /* 273: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a00 */\ +sqr_n_mul(t[0], t[0], 9, t[8]); /* 274: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f */\ +/* sqr(t[0], t[0]); */ /* 275: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e */\ +/* sqr(t[0], t[0]); */ /* 276: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83c */\ +/* sqr(t[0], t[0]); */ /* 277: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695078 */\ +/* sqr(t[0], t[0]); */ /* 278: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f0 */\ +/* sqr(t[0], t[0]); */ /* 279: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 280: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed */\ +/* sqr(t[0], t[0]); */ /* 281: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83da */\ +/* sqr(t[0], t[0]); */ /* 282: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b4 */\ +/* sqr(t[0], t[0]); */ /* 283: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f68 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 284: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b */\ +/* sqr(t[0], t[0]); */ /* 285: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed6 */\ +/* sqr(t[0], t[0]); */ /* 286: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac */\ +/* sqr(t[0], t[0]); */ /* 287: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b58 */\ +/* sqr(t[0], t[0]); */ /* 288: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0 */\ +/* sqr(t[0], t[0]); */ /* 289: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed60 */\ +/* sqr(t[0], t[0]); */ /* 290: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac0 */\ +/* sqr(t[0], t[0]); */ /* 291: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b580 */\ +/* sqr(t[0], t[0]); */ /* 292: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b00 */\ +sqr_n_mul(t[0], t[0], 8, t[8]); /* 293: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f */\ +/* sqr(t[0], t[0]); */ /* 294: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61e */\ +/* sqr(t[0], t[0]); */ /* 295: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3c */\ +/* sqr(t[0], t[0]); */ /* 296: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b5878 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 297: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b */\ +/* sqr(t[0], t[0]); */ /* 298: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6 */\ +/* sqr(t[0], t[0]); */ /* 299: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec */\ +/* sqr(t[0], t[0]); */ /* 300: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8 */\ +/* sqr(t[0], t[0]); */ /* 301: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b0 */\ +/* sqr(t[0], t[0]); */ /* 302: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f60 */\ +/* sqr(t[0], t[0]); */ /* 303: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec0 */\ +/* sqr(t[0], t[0]); */ /* 304: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d80 */\ +sqr_n_mul(t[0], t[0], 7, t[10]); /* 305: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89 */\ +/* sqr(t[0], t[0]); */ /* 306: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b12 */\ +/* sqr(t[0], t[0]); */ /* 307: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624 */\ +/* sqr(t[0], t[0]); */ /* 308: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec48 */\ +/* sqr(t[0], t[0]); */ /* 309: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d890 */\ +/* sqr(t[0], t[0]); */ /* 310: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120 */\ +/* sqr(t[0], t[0]); */ /* 311: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6240 */\ +/* sqr(t[0], t[0]); */ /* 312: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec480 */\ +/* sqr(t[0], t[0]); */ /* 313: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8900 */\ +/* sqr(t[0], t[0]); */ /* 314: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b1200 */\ +sqr_n_mul(t[0], t[0], 9, t[8]); /* 315: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f */\ +/* sqr(t[0], t[0]); */ /* 316: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e */\ +/* sqr(t[0], t[0]); */ /* 317: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c */\ +/* sqr(t[0], t[0]); */ /* 318: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89078 */\ +/* sqr(t[0], t[0]); */ /* 319: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f0 */\ +/* sqr(t[0], t[0]); */ /* 320: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e0 */\ +/* sqr(t[0], t[0]); */ /* 321: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 322: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d5 */\ +/* sqr(t[0], t[0]); */ /* 323: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa */\ +/* sqr(t[0], t[0]); */ /* 324: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f54 */\ +/* sqr(t[0], t[0]); */ /* 325: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241ea8 */\ +/* sqr(t[0], t[0]); */ /* 326: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d50 */\ +/* sqr(t[0], t[0]); */ /* 327: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa0 */\ +/* sqr(t[0], t[0]); */ /* 328: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f540 */\ +sqr_n_mul(t[0], t[0], 6, t[5]); /* 329: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f */\ +/* sqr(t[0], t[0]); */ /* 330: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe */\ +/* sqr(t[0], t[0]); */ /* 331: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57c */\ +/* sqr(t[0], t[0]); */ /* 332: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaf8 */\ +/* sqr(t[0], t[0]); */ /* 333: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f0 */\ +/* sqr(t[0], t[0]); */ /* 334: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 335: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff */\ +/* sqr(t[0], t[0]); */ /* 336: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe */\ +/* sqr(t[0], t[0]); */ /* 337: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffc */\ +/* sqr(t[0], t[0]); */ /* 338: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ff8 */\ +/* sqr(t[0], t[0]); */ /* 339: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff0 */\ +/* sqr(t[0], t[0]); */ /* 340: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 341: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff */\ +/* sqr(t[0], t[0]); */ /* 342: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aafffe */\ +/* sqr(t[0], t[0]); */ /* 343: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55fffc */\ +/* sqr(t[0], t[0]); */ /* 344: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfff8 */\ +/* sqr(t[0], t[0]); */ /* 345: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 346: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd */\ +/* sqr(t[0], t[0]); */ /* 347: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffa */\ +/* sqr(t[0], t[0]); */ /* 348: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff4 */\ +/* sqr(t[0], t[0]); */ /* 349: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffe8 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 350: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb */\ +/* sqr(t[0], t[0]); */ /* 351: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd6 */\ +/* sqr(t[0], t[0]); */ /* 352: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac */\ +/* sqr(t[0], t[0]); */ /* 353: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58 */\ +/* sqr(t[0], t[0]); */ /* 354: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb0 */\ +/* sqr(t[0], t[0]); */ /* 355: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd60 */\ +/* sqr(t[0], t[0]); */ /* 356: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac0 */\ +/* sqr(t[0], t[0]); */ /* 357: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff580 */\ +/* sqr(t[0], t[0]); */ /* 358: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb00 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 359: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb15 */\ +/* sqr(t[0], t[0]); */ /* 360: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a */\ +/* sqr(t[0], t[0]); */ /* 361: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54 */\ +/* sqr(t[0], t[0]); */ /* 362: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a8 */\ +/* sqr(t[0], t[0]); */ /* 363: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb150 */\ +/* sqr(t[0], t[0]); */ /* 364: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a0 */\ +/* sqr(t[0], t[0]); */ /* 365: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac540 */\ +/* sqr(t[0], t[0]); */ /* 366: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a80 */\ +sqr_n_mul(t[0], t[0], 7, 
t[5]); /* 367: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f */\ +/* sqr(t[0], t[0]); */ /* 368: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e */\ +/* sqr(t[0], t[0]); */ /* 369: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7c */\ +/* sqr(t[0], t[0]); */ /* 370: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54f8 */\ +/* sqr(t[0], t[0]); */ /* 371: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f0 */\ +/* sqr(t[0], t[0]); */ /* 372: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 373: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff */\ +/* sqr(t[0], t[0]); */ /* 374: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe */\ +/* sqr(t[0], t[0]); */ /* 375: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffc */\ +/* sqr(t[0], t[0]); */ /* 376: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ff8 */\ +/* sqr(t[0], t[0]); */ /* 377: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff0 */\ +/* sqr(t[0], t[0]); */ /* 378: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 379: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff */\ +/* sqr(t[0], t[0]); */ /* 380: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54fffe */\ +/* sqr(t[0], t[0]); */ /* 381: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9fffc */\ +/* sqr(t[0], t[0]); */ /* 382: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153fff8 */\ +/* sqr(t[0], t[0]); */ /* 383: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[8]); /* 384: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff */\ +/* sqr(t[0], t[0]); */ /* 385: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffe */\ +/* sqr(t[0], t[0]); */ /* 386: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffc */\ +/* sqr(t[0], t[0]); */ /* 387: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffff8 */\ +/* sqr(t[0], t[0]); */ /* 388: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[7]); /* 389: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff7 */\ +/* sqr(t[0], t[0]); */ /* 390: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee */\ +/* sqr(t[0], t[0]); */ /* 391: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc */\ +/* sqr(t[0], t[0]); */ /* 392: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb8 */\ +/* sqr(t[0], t[0]); */ /* 393: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff70 */\ +/* sqr(t[0], t[0]); */ /* 394: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee0 */\ +/* sqr(t[0], t[0]); */ /* 395: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc0 */\ +/* sqr(t[0], t[0]); */ /* 396: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb80 */\ +sqr_n_mul(t[0], t[0], 7, t[5]); /* 397: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f */\ +/* sqr(t[0], t[0]); */ /* 398: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e */\ +/* sqr(t[0], t[0]); */ /* 399: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7c */\ +/* sqr(t[0], t[0]); */ /* 400: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcf8 */\ +/* sqr(t[0], t[0]); */ /* 401: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f0 */\ +/* sqr(t[0], t[0]); */ /* 402: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e0 */\ +sqr_n_mul(t[0], t[0], 5, t[6]); /* 403: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd */\ +/* sqr(t[0], t[0]); */ /* 404: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa */\ +/* sqr(t[0], t[0]); */ /* 405: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff4 */\ +/* sqr(t[0], t[0]); */ /* 406: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fe8 */\ +/* sqr(t[0], t[0]); */ /* 407: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd0 */\ +/* sqr(t[0], t[0]); */ /* 408: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 409: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf */\ +/* sqr(t[0], t[0]); */ /* 410: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e */\ +/* sqr(t[0], t[0]); */ /* 411: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefc */\ +/* sqr(t[0], t[0]); */ /* 412: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdf8 */\ +/* sqr(t[0], t[0]); */ /* 413: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf0 */\ +/* sqr(t[0], t[0]); */ /* 414: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 415: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff */\ +/* sqr(t[0], t[0]); */ /* 416: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe */\ +/* sqr(t[0], t[0]); */ /* 417: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffc */\ +/* sqr(t[0], t[0]); */ /* 418: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbff8 */\ +/* sqr(t[0], t[0]); */ /* 419: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff0 */\ +/* sqr(t[0], t[0]); */ /* 420: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 421: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff */\ +/* sqr(t[0], t[0]); */ /* 422: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe */\ +/* sqr(t[0], t[0]); */ /* 423: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffc */\ +/* sqr(t[0], t[0]); */ /* 424: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fff8 */\ +/* sqr(t[0], t[0]); */ /* 425: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff0 */\ +/* sqr(t[0], t[0]); */ /* 426: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 427: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff */\ +/* sqr(t[0], t[0]); */ /* 428: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe */\ +/* sqr(t[0], t[0]); */ /* 429: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ffffc */\ +/* sqr(t[0], t[0]); */ /* 430: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffff8 */\ +/* sqr(t[0], t[0]); */ /* 431: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff0 */\ +/* sqr(t[0], t[0]); */ /* 432: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 433: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff */\ +/* sqr(t[0], t[0]); */ /* 434: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe */\ +/* sqr(t[0], t[0]); */ /* 435: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffc */\ +/* sqr(t[0], t[0]); */ /* 436: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffff8 */\ +/* sqr(t[0], t[0]); */ /* 437: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff0 */\ +/* sqr(t[0], t[0]); */ /* 438: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 439: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff */\ +/* sqr(t[0], t[0]); */ /* 440: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffffffe */\ +/* sqr(t[0], t[0]); */ /* 441: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffffffc */\ +/* sqr(t[0], t[0]); */ /* 442: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffff8 */\ +/* sqr(t[0], t[0]); */ /* 443: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 444: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd */\ +/* sqr(t[0], t[0]); */ /* 445: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa */\ +/* sqr(t[0], t[0]); */ /* 446: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff4 */\ +/* sqr(t[0], t[0]); */ /* 447: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffe8 */\ +/* sqr(t[0], t[0]); */ /* 448: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd0 */\ +/* sqr(t[0], t[0]); */ /* 449: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa0 */\ +/* sqr(t[0], t[0]); */ /* 450: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff40 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 451: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff55 */\ +/* sqr(t[0], t[0]); */ /* 452: 
680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaa */\ +/* sqr(t[0], t[0]); */ /* 453: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd54 */\ +/* sqr(t[0], t[0]); */ /* 454: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaa8 */\ +/* sqr(t[0], t[0]); */ /* 455: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff550 */\ +sqr_n_mul(t[0], t[0], 4, t[2]); /* 456: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff555 */\ +/* sqr(t[0], t[0]); */ /* 457: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaaa */\ +/* sqr(t[0], t[0]); */ /* 458: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd554 */\ +/* sqr(t[0], t[0]); */ /* 459: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa8 */\ +sqr_n_mul(out, t[0], 3, t[1]); /* 460: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9 */\ +} while(0) diff --git a/crypto/blst_src/recip.c b/crypto/blst_src/recip.c new file mode 100644 index 00000000000..e0c700635ed --- /dev/null +++ b/crypto/blst_src/recip.c @@ -0,0 +1,139 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" + +#ifdef __OPTIMIZE_SIZE__ +/* + * 608 multiplications for scalar inversion modulo BLS12-381 prime, 32% + * more than corresponding optimal addition-chain, plus mispredicted + * branch penalties on top of that... The addition chain below was + * measured to be >50% faster. 
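[Editorial aside, not part of the upstream blst sources] The size-optimized branch described above is plain Fermat inversion: for prime p and nonzero x, x^(p-2) = x^-1 (mod p), evaluated with generic square-and-multiply, which is exactly what the fixed addition chain replaces with a cheaper hand-scheduled sequence of squarings and multiplications. A minimal stand-alone sketch of that generic approach over a toy 64-bit prime, assuming a compiler that provides the __uint128_t extension:

#include <stdint.h>
#include <stdio.h>

/* (a * b) mod p without overflow, via the 128-bit compiler extension */
static uint64_t mulmod(uint64_t a, uint64_t b, uint64_t p)
{   return (uint64_t)(((__uint128_t)a * b) % p);   }

/* right-to-left square-and-multiply: x^e mod p */
static uint64_t powmod(uint64_t x, uint64_t e, uint64_t p)
{
    uint64_t acc = 1;
    for (; e != 0; e >>= 1, x = mulmod(x, x, p))
        if (e & 1)
            acc = mulmod(acc, x, p);
    return acc;
}

int main(void)
{
    const uint64_t p = 0xffffffff00000001ULL;   /* a 64-bit prime, 2^64 - 2^32 + 1 */
    uint64_t x = 12345, inv = powmod(x, p - 2, p);
    printf("x * x^-1 mod p = %llu\n", (unsigned long long)mulmod(x, inv, p)); /* 1 */
    return 0;
}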
+ */ +static void flt_reciprocal_fp(vec384 out, const vec384 inp) +{ + static const byte BLS12_381_P_minus_2[] = { + TO_BYTES(0xb9feffffffffaaa9), TO_BYTES(0x1eabfffeb153ffff), + TO_BYTES(0x6730d2a0f6b0f624), TO_BYTES(0x64774b84f38512bf), + TO_BYTES(0x4b1ba7b6434bacd7), TO_BYTES(0x1a0111ea397fe69a) + }; + + exp_mont_384(out, inp, BLS12_381_P_minus_2, 381, BLS12_381_P, p0); +} +#else +# define sqr(ret,a) sqr_fp(ret,a) +# define mul(ret,a,b) mul_fp(ret,a,b) +# define sqr_n_mul(ret,a,n,b) sqr_n_mul_fp(ret,a,n,b) + +# include "recip-addchain.h" +static void flt_reciprocal_fp(vec384 out, const vec384 inp) +{ + RECIPROCAL_MOD_BLS12_381_P(out, inp, vec384); +} +# undef RECIPROCAL_MOD_BLS12_381_P +# undef sqr_n_mul +# undef mul +# undef sqr +#endif + +static void flt_reciprocal_fp2(vec384x out, const vec384x inp) +{ + vec384 t0, t1; + + /* + * |out| = 1/(a + b*i) = a/(a^2+b^2) - b/(a^2+b^2)*i + */ + sqr_fp(t0, inp[0]); + sqr_fp(t1, inp[1]); + add_fp(t0, t0, t1); + flt_reciprocal_fp(t1, t0); + mul_fp(out[0], inp[0], t1); + mul_fp(out[1], inp[1], t1); + neg_fp(out[1], out[1]); +} + +static void reciprocal_fp(vec384 out, const vec384 inp) +{ + static const vec384 Px8 = { /* left-aligned value of the modulus */ + TO_LIMB_T(0xcff7fffffffd5558), TO_LIMB_T(0xf55ffff58a9ffffd), + TO_LIMB_T(0x39869507b587b120), TO_LIMB_T(0x23ba5c279c2895fb), + TO_LIMB_T(0x58dd3db21a5d66bb), TO_LIMB_T(0xd0088f51cbff34d2) + }; +#ifdef __BLST_NO_ASM__ +# define RRx4 BLS12_381_RR +#else + static const vec384 RRx4 = { /* (4<<768)%P */ + TO_LIMB_T(0x5f7e7cd070d107c2), TO_LIMB_T(0xec839a9ac49c13c8), + TO_LIMB_T(0x6933786f44f4ef0b), TO_LIMB_T(0xd6bf8b9c676be983), + TO_LIMB_T(0xd3adaaaa4dcefb06), TO_LIMB_T(0x12601bc1d82bc175) + }; +#endif + union { vec768 x; vec384 r[2]; } temp; + + ct_inverse_mod_383(temp.x, inp, BLS12_381_P, Px8); + redc_mont_384(temp.r[0], temp.x, BLS12_381_P, p0); + mul_mont_384(temp.r[0], temp.r[0], RRx4, BLS12_381_P, p0); + +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + /* sign goes straight to flt_reciprocal */ + mul_mont_384(temp.r[1], temp.r[0], inp, BLS12_381_P, p0); + if (vec_is_equal(temp.r[1], BLS12_381_Rx.p, sizeof(vec384)) | + vec_is_zero(temp.r[1], sizeof(vec384))) + vec_copy(out, temp.r[0], sizeof(vec384)); + else + flt_reciprocal_fp(out, inp); +#else + vec_copy(out, temp.r[0], sizeof(vec384)); +#endif +#undef RRx4 +} + +void blst_fp_inverse(vec384 out, const vec384 inp) +{ reciprocal_fp(out, inp); } + +void blst_fp_eucl_inverse(vec384 ret, const vec384 a) +{ reciprocal_fp(ret, a); } + +static void reciprocal_fp2(vec384x out, const vec384x inp) +{ + vec384 t0, t1; + + /* + * |out| = 1/(a + b*i) = a/(a^2+b^2) - b/(a^2+b^2)*i + */ + sqr_fp(t0, inp[0]); + sqr_fp(t1, inp[1]); + add_fp(t0, t0, t1); + reciprocal_fp(t1, t0); + mul_fp(out[0], inp[0], t1); + mul_fp(out[1], inp[1], t1); + neg_fp(out[1], out[1]); +} + +void blst_fp2_inverse(vec384x out, const vec384x inp) +{ reciprocal_fp2(out, inp); } + +void blst_fp2_eucl_inverse(vec384x out, const vec384x inp) +{ reciprocal_fp2(out, inp); } + +static void reciprocal_fr(vec256 out, const vec256 inp) +{ + static const vec256 rx2 = { /* left-aligned value of the modulus */ + TO_LIMB_T(0xfffffffe00000002), TO_LIMB_T(0xa77b4805fffcb7fd), + TO_LIMB_T(0x6673b0101343b00a), TO_LIMB_T(0xe7db4ea6533afa90), + }; + vec512 temp; + + ct_inverse_mod_256(temp, inp, BLS12_381_r, rx2); + redc_mont_256(out, temp, BLS12_381_r, r0); + mul_mont_sparse_256(out, out, BLS12_381_rRR, BLS12_381_r, r0); +} + +void blst_fr_inverse(vec256 out, const vec256 inp) +{ 
reciprocal_fr(out, inp); } + +void blst_fr_eucl_inverse(vec256 out, const vec256 inp) +{ reciprocal_fr(out, inp); } diff --git a/crypto/blst_src/server.c b/crypto/blst_src/server.c new file mode 100644 index 00000000000..c124bcec078 --- /dev/null +++ b/crypto/blst_src/server.c @@ -0,0 +1,27 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "keygen.c" +#include "hash_to_field.c" +#include "e1.c" +#include "map_to_g1.c" +#include "e2.c" +#include "map_to_g2.c" +#include "fp12_tower.c" +#include "pairing.c" +#include "aggregate.c" +#include "exp.c" +#include "sqrt.c" +#include "recip.c" +#include "bulk_addition.c" +#include "multi_scalar.c" +#include "consts.c" +#include "vect.c" +#include "exports.c" +#include "rb_tree.c" +#ifdef BLST_FR_PENTAROOT +# include "pentaroot.c" +#endif diff --git a/crypto/blst_src/sha256.h b/crypto/blst_src/sha256.h new file mode 100644 index 00000000000..77ddb6dc848 --- /dev/null +++ b/crypto/blst_src/sha256.h @@ -0,0 +1,140 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_SHA256_H__ +#define __BLS12_381_ASM_SHA256_H__ + +#include "vect.h" + +#if (defined(__x86_64__) || defined(__x86_64) || defined(_M_X64)) && \ + defined(__SHA__) /* -msha */ && !defined(__BLST_PORTABLE__) +# define sha256_block_data_order blst_sha256_block_data_order_shaext +#elif defined(__aarch64__) && \ + defined(__ARM_FEATURE_CRYPTO) && !defined(__BLST_PORTABLE__) +# define sha256_block_data_order blst_sha256_block_armv8 +#else +# define sha256_block_data_order blst_sha256_block_data_order +#endif +#define sha256_hcopy blst_sha256_hcopy +#define sha256_bcopy blst_sha256_bcopy +#define sha256_emit blst_sha256_emit + +void sha256_block_data_order(unsigned int *h, const void *inp, size_t blocks); +void sha256_hcopy(unsigned int dst[8], const unsigned int src[8]); +void sha256_bcopy(void *dst, const void *src, size_t len); + +/* + * If SHA256_CTX conflicts with something, just redefine it to alternative + * custom name prior including this header. 
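[Editorial aside, not part of the upstream diff] A minimal usage sketch of the streaming SHA-256 context defined just below, assuming the including translation unit is linked against the blst object that provides sha256_block_data_order and sha256_emit:

/* hash two buffers as one message: init, update as data arrives, finalize */
static void digest_two_parts(unsigned char md[32],
                             const void *part1, size_t len1,
                             const void *part2, size_t len2)
{
    SHA256_CTX ctx;

    sha256_init(&ctx);                 /* reset h[] to the SHA-256 IV       */
    sha256_update(&ctx, part1, len1);  /* buffers and/or compresses blocks  */
    sha256_update(&ctx, part2, len2);
    sha256_final(md, &ctx);            /* pads, compresses, emits 32 bytes  */
}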
+ */ +typedef struct { + unsigned int h[8]; + unsigned long long N; + unsigned char buf[64]; + size_t off; +} SHA256_CTX; + + +static void sha256_init_h(unsigned int h[8]) +{ + h[0] = 0x6a09e667U; + h[1] = 0xbb67ae85U; + h[2] = 0x3c6ef372U; + h[3] = 0xa54ff53aU; + h[4] = 0x510e527fU; + h[5] = 0x9b05688cU; + h[6] = 0x1f83d9abU; + h[7] = 0x5be0cd19U; +} + +static void sha256_init(SHA256_CTX *ctx) +{ + sha256_init_h(ctx->h); + ctx->N = 0; + vec_zero(ctx->buf, sizeof(ctx->buf)); + ctx->off = 0; +} + +static void sha256_update(SHA256_CTX *ctx, const void *_inp, size_t len) +{ + size_t n; + const unsigned char *inp = _inp; + + ctx->N += len; + + if ((len != 0) & ((n = ctx->off) != 0)) { + size_t rem = sizeof(ctx->buf) - n; + + if (rem > len) { + sha256_bcopy(ctx->buf + n, inp, len); + ctx->off += len; + return; + } else { + sha256_bcopy(ctx->buf + n, inp, rem); + inp += rem; + len -= rem; + sha256_block_data_order(ctx->h, ctx->buf, 1); + vec_zero(ctx->buf, sizeof(ctx->buf)); + ctx->off = 0; + } + } + + n = len / sizeof(ctx->buf); + if (n > 0) { + sha256_block_data_order(ctx->h, inp, n); + n *= sizeof(ctx->buf); + inp += n; + len -= n; + } + + if (len) + sha256_bcopy(ctx->buf, inp, ctx->off = len); +} + +#define __TOBE32(ptr, val) ((ptr)[0] = (unsigned char)((val)>>24), \ + (ptr)[1] = (unsigned char)((val)>>16), \ + (ptr)[2] = (unsigned char)((val)>>8), \ + (ptr)[3] = (unsigned char)(val)) + +#if 1 +void sha256_emit(unsigned char md[32], const unsigned int h[8]); +#else +static void sha256_emit(unsigned char md[32], const unsigned int h[8]) +{ + unsigned int h_i; + + h_i = h[0]; __TOBE32(md + 0, h_i); + h_i = h[1]; __TOBE32(md + 4, h_i); + h_i = h[2]; __TOBE32(md + 8, h_i); + h_i = h[3]; __TOBE32(md + 12, h_i); + h_i = h[4]; __TOBE32(md + 16, h_i); + h_i = h[5]; __TOBE32(md + 20, h_i); + h_i = h[6]; __TOBE32(md + 24, h_i); + h_i = h[7]; __TOBE32(md + 28, h_i); +} +#endif + +static void sha256_final(unsigned char md[32], SHA256_CTX *ctx) +{ + unsigned long long bits = ctx->N * 8; + size_t n = ctx->off; + unsigned char *tail; + + ctx->buf[n++] = 0x80; + + if (n > (sizeof(ctx->buf) - 8)) { + sha256_block_data_order(ctx->h, ctx->buf, 1); + vec_zero(ctx->buf, sizeof(ctx->buf)); + } + + tail = ctx->buf + sizeof(ctx->buf) - 8; + __TOBE32(tail, (unsigned int)(bits >> 32)); + __TOBE32(tail + 4, (unsigned int)bits); + sha256_block_data_order(ctx->h, ctx->buf, 1); + sha256_emit(md, ctx->h); +} + +#undef __TOBE32 +#endif diff --git a/crypto/blst_src/sqrt-addchain.h b/crypto/blst_src/sqrt-addchain.h new file mode 100644 index 00000000000..4e7f0beb6b1 --- /dev/null +++ b/crypto/blst_src/sqrt-addchain.h @@ -0,0 +1,489 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * The "magic" number is (BLS12_381_P-3)/4. Exponentiation to which + * yields reciprocal of sqrt(x), which is used in simplified Shallue- + * van de Woestijne-Ulas map-to-curve method, but it's trivial to adapt + * it for more "traditional" sqrt(x) as 'x*ret' (or for is_square(x) + * as 'x*ret^2==1'). 
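[Editorial note, standard number theory, not part of the diff] Why exponentiation by (p-3)/4 yields a reciprocal square root when p is 3 mod 4, as BLS12-381's p is: write r = x^((p-3)/4) for nonzero x. Then

    x*r^2 = x^((p-1)/2), which equals 1 exactly when x is a square (Euler's criterion), and
    x*r   = x^((p+1)/4), whose square is x^((p+1)/2) = x * x^((p-1)/2) = x whenever x is a square,

so for quadratic residues r is 1/sqrt(x), x*r is a square root, and x*r^2 == 1 is the is_square test quoted above.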
+ * + * Generated with 'addchain 1000602388805416848354447456433976039139220704984751971333014534031007912622709466110671907282253916009473568139946' + * https://github.com/kwantam/addchain + * + * # Bos-Coster (win=4) : 458 (16) <<< + * # Bos-Coster (win=5) : 460 (28) + * # Bos-Coster (win=6) : 461 (33) + * # Bos-Coster (win=7) : 460 (28) + * # Bos-Coster (win=3) : 462 ( 9) + * # Bos-Coster (win=8) : 466 (34) + * # Bos-Coster (win=9) : 464 (31) + * # Yacobi : 478 (31) + * # Bos-Coster (win=10) : 473 (30) + * # Bos-Coster (win=2) : 486 ( 5) + * # Bergeron-Berstel-Brlek-Duboc : 489 ( 5) + */ + +#define RECIP_SQRT_MOD_BLS12_381_P(out, inp, ptype) do { \ +ptype t[16]; \ +vec_copy(t[13], inp, sizeof(ptype));/* 0: 1 */\ +sqr(t[0], t[13]); /* 1: 2 */\ +mul(t[8], t[0], t[13]); /* 2: 3 */\ +sqr(t[4], t[0]); /* 3: 4 */\ +mul(t[1], t[8], t[0]); /* 4: 5 */\ +mul(t[6], t[4], t[8]); /* 5: 7 */\ +mul(t[9], t[1], t[4]); /* 6: 9 */\ +mul(t[12], t[6], t[4]); /* 7: b */\ +mul(t[3], t[9], t[4]); /* 8: d */\ +mul(t[7], t[12], t[4]); /* 9: f */\ +mul(t[15], t[3], t[4]); /* 10: 11 */\ +mul(t[10], t[7], t[4]); /* 11: 13 */\ +mul(t[2], t[15], t[4]); /* 12: 15 */\ +mul(t[11], t[10], t[4]); /* 13: 17 */\ +sqr(t[0], t[3]); /* 14: 1a */\ +mul(t[14], t[11], t[4]); /* 15: 1b */\ +mul(t[5], t[0], t[8]); /* 16: 1d */\ +mul(t[4], t[0], t[1]); /* 17: 1f */\ +/* sqr(t[0], t[0]); */ /* 18: 34 */\ +/* sqr(t[0], t[0]); */ /* 19: 68 */\ +/* sqr(t[0], t[0]); */ /* 20: d0 */\ +/* sqr(t[0], t[0]); */ /* 21: 1a0 */\ +/* sqr(t[0], t[0]); */ /* 22: 340 */\ +/* sqr(t[0], t[0]); */ /* 23: 680 */\ +/* sqr(t[0], t[0]); */ /* 24: d00 */\ +/* sqr(t[0], t[0]); */ /* 25: 1a00 */\ +/* sqr(t[0], t[0]); */ /* 26: 3400 */\ +/* sqr(t[0], t[0]); */ /* 27: 6800 */\ +/* sqr(t[0], t[0]); */ /* 28: d000 */\ +/* sqr(t[0], t[0]); */ /* 29: 1a000 */\ +sqr_n_mul(t[0], t[0], 12, t[15]); /* 30: 1a011 */\ +/* sqr(t[0], t[0]); */ /* 31: 34022 */\ +/* sqr(t[0], t[0]); */ /* 32: 68044 */\ +/* sqr(t[0], t[0]); */ /* 33: d0088 */\ +/* sqr(t[0], t[0]); */ /* 34: 1a0110 */\ +/* sqr(t[0], t[0]); */ /* 35: 340220 */\ +/* sqr(t[0], t[0]); */ /* 36: 680440 */\ +/* sqr(t[0], t[0]); */ /* 37: d00880 */\ +sqr_n_mul(t[0], t[0], 7, t[7]); /* 38: d0088f */\ +/* sqr(t[0], t[0]); */ /* 39: 1a0111e */\ +/* sqr(t[0], t[0]); */ /* 40: 340223c */\ +/* sqr(t[0], t[0]); */ /* 41: 6804478 */\ +/* sqr(t[0], t[0]); */ /* 42: d0088f0 */\ +sqr_n_mul(t[0], t[0], 4, t[1]); /* 43: d0088f5 */\ +/* sqr(t[0], t[0]); */ /* 44: 1a0111ea */\ +/* sqr(t[0], t[0]); */ /* 45: 340223d4 */\ +/* sqr(t[0], t[0]); */ /* 46: 680447a8 */\ +/* sqr(t[0], t[0]); */ /* 47: d0088f50 */\ +/* sqr(t[0], t[0]); */ /* 48: 1a0111ea0 */\ +/* sqr(t[0], t[0]); */ /* 49: 340223d40 */\ +sqr_n_mul(t[0], t[0], 6, t[6]); /* 50: 340223d47 */\ +/* sqr(t[0], t[0]); */ /* 51: 680447a8e */\ +/* sqr(t[0], t[0]); */ /* 52: d0088f51c */\ +/* sqr(t[0], t[0]); */ /* 53: 1a0111ea38 */\ +/* sqr(t[0], t[0]); */ /* 54: 340223d470 */\ +/* sqr(t[0], t[0]); */ /* 55: 680447a8e0 */\ +/* sqr(t[0], t[0]); */ /* 56: d0088f51c0 */\ +/* sqr(t[0], t[0]); */ /* 57: 1a0111ea380 */\ +sqr_n_mul(t[0], t[0], 7, t[11]); /* 58: 1a0111ea397 */\ +/* sqr(t[0], t[0]); */ /* 59: 340223d472e */\ +/* sqr(t[0], t[0]); */ /* 60: 680447a8e5c */\ +/* sqr(t[0], t[0]); */ /* 61: d0088f51cb8 */\ +/* sqr(t[0], t[0]); */ /* 62: 1a0111ea3970 */\ +/* sqr(t[0], t[0]); */ /* 63: 340223d472e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 64: 340223d472ff */\ +/* sqr(t[0], t[0]); */ /* 65: 680447a8e5fe */\ +/* sqr(t[0], t[0]); */ /* 66: d0088f51cbfc */\ +sqr_n_mul(t[0], t[0], 2, 
t[8]); /* 67: d0088f51cbff */\ +/* sqr(t[0], t[0]); */ /* 68: 1a0111ea397fe */\ +/* sqr(t[0], t[0]); */ /* 69: 340223d472ffc */\ +/* sqr(t[0], t[0]); */ /* 70: 680447a8e5ff8 */\ +/* sqr(t[0], t[0]); */ /* 71: d0088f51cbff0 */\ +/* sqr(t[0], t[0]); */ /* 72: 1a0111ea397fe0 */\ +/* sqr(t[0], t[0]); */ /* 73: 340223d472ffc0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 74: 340223d472ffcd */\ +/* sqr(t[0], t[0]); */ /* 75: 680447a8e5ff9a */\ +/* sqr(t[0], t[0]); */ /* 76: d0088f51cbff34 */\ +/* sqr(t[0], t[0]); */ /* 77: 1a0111ea397fe68 */\ +/* sqr(t[0], t[0]); */ /* 78: 340223d472ffcd0 */\ +/* sqr(t[0], t[0]); */ /* 79: 680447a8e5ff9a0 */\ +/* sqr(t[0], t[0]); */ /* 80: d0088f51cbff340 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 81: d0088f51cbff34d */\ +/* sqr(t[0], t[0]); */ /* 82: 1a0111ea397fe69a */\ +/* sqr(t[0], t[0]); */ /* 83: 340223d472ffcd34 */\ +/* sqr(t[0], t[0]); */ /* 84: 680447a8e5ff9a68 */\ +/* sqr(t[0], t[0]); */ /* 85: d0088f51cbff34d0 */\ +/* sqr(t[0], t[0]); */ /* 86: 1a0111ea397fe69a0 */\ +/* sqr(t[0], t[0]); */ /* 87: 340223d472ffcd340 */\ +sqr_n_mul(t[0], t[0], 6, t[9]); /* 88: 340223d472ffcd349 */\ +/* sqr(t[0], t[0]); */ /* 89: 680447a8e5ff9a692 */\ +/* sqr(t[0], t[0]); */ /* 90: d0088f51cbff34d24 */\ +/* sqr(t[0], t[0]); */ /* 91: 1a0111ea397fe69a48 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 92: 1a0111ea397fe69a4b */\ +/* sqr(t[0], t[0]); */ /* 93: 340223d472ffcd3496 */\ +/* sqr(t[0], t[0]); */ /* 94: 680447a8e5ff9a692c */\ +/* sqr(t[0], t[0]); */ /* 95: d0088f51cbff34d258 */\ +/* sqr(t[0], t[0]); */ /* 96: 1a0111ea397fe69a4b0 */\ +/* sqr(t[0], t[0]); */ /* 97: 340223d472ffcd34960 */\ +/* sqr(t[0], t[0]); */ /* 98: 680447a8e5ff9a692c0 */\ +/* sqr(t[0], t[0]); */ /* 99: d0088f51cbff34d2580 */\ +sqr_n_mul(t[0], t[0], 7, t[3]); /* 100: d0088f51cbff34d258d */\ +/* sqr(t[0], t[0]); */ /* 101: 1a0111ea397fe69a4b1a */\ +/* sqr(t[0], t[0]); */ /* 102: 340223d472ffcd349634 */\ +/* sqr(t[0], t[0]); */ /* 103: 680447a8e5ff9a692c68 */\ +/* sqr(t[0], t[0]); */ /* 104: d0088f51cbff34d258d0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 105: d0088f51cbff34d258dd */\ +/* sqr(t[0], t[0]); */ /* 106: 1a0111ea397fe69a4b1ba */\ +/* sqr(t[0], t[0]); */ /* 107: 340223d472ffcd3496374 */\ +/* sqr(t[0], t[0]); */ /* 108: 680447a8e5ff9a692c6e8 */\ +/* sqr(t[0], t[0]); */ /* 109: d0088f51cbff34d258dd0 */\ +/* sqr(t[0], t[0]); */ /* 110: 1a0111ea397fe69a4b1ba0 */\ +/* sqr(t[0], t[0]); */ /* 111: 340223d472ffcd34963740 */\ +sqr_n_mul(t[0], t[0], 6, t[7]); /* 112: 340223d472ffcd3496374f */\ +/* sqr(t[0], t[0]); */ /* 113: 680447a8e5ff9a692c6e9e */\ +/* sqr(t[0], t[0]); */ /* 114: d0088f51cbff34d258dd3c */\ +/* sqr(t[0], t[0]); */ /* 115: 1a0111ea397fe69a4b1ba78 */\ +/* sqr(t[0], t[0]); */ /* 116: 340223d472ffcd3496374f0 */\ +/* sqr(t[0], t[0]); */ /* 117: 680447a8e5ff9a692c6e9e0 */\ +/* sqr(t[0], t[0]); */ /* 118: d0088f51cbff34d258dd3c0 */\ +sqr_n_mul(t[0], t[0], 6, t[14]); /* 119: d0088f51cbff34d258dd3db */\ +/* sqr(t[0], t[0]); */ /* 120: 1a0111ea397fe69a4b1ba7b6 */\ +/* sqr(t[0], t[0]); */ /* 121: 340223d472ffcd3496374f6c */\ +/* sqr(t[0], t[0]); */ /* 122: 680447a8e5ff9a692c6e9ed8 */\ +sqr_n_mul(t[0], t[0], 3, t[13]); /* 123: 680447a8e5ff9a692c6e9ed9 */\ +/* sqr(t[0], t[0]); */ /* 124: d0088f51cbff34d258dd3db2 */\ +/* sqr(t[0], t[0]); */ /* 125: 1a0111ea397fe69a4b1ba7b64 */\ +/* sqr(t[0], t[0]); */ /* 126: 340223d472ffcd3496374f6c8 */\ +/* sqr(t[0], t[0]); */ /* 127: 680447a8e5ff9a692c6e9ed90 */\ +/* sqr(t[0], t[0]); */ /* 128: d0088f51cbff34d258dd3db20 */\ +/* sqr(t[0], t[0]); */ /* 129: 
1a0111ea397fe69a4b1ba7b640 */\ +/* sqr(t[0], t[0]); */ /* 130: 340223d472ffcd3496374f6c80 */\ +/* sqr(t[0], t[0]); */ /* 131: 680447a8e5ff9a692c6e9ed900 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 132: 680447a8e5ff9a692c6e9ed90d */\ +/* sqr(t[0], t[0]); */ /* 133: d0088f51cbff34d258dd3db21a */\ +/* sqr(t[0], t[0]); */ /* 134: 1a0111ea397fe69a4b1ba7b6434 */\ +/* sqr(t[0], t[0]); */ /* 135: 340223d472ffcd3496374f6c868 */\ +/* sqr(t[0], t[0]); */ /* 136: 680447a8e5ff9a692c6e9ed90d0 */\ +/* sqr(t[0], t[0]); */ /* 137: d0088f51cbff34d258dd3db21a0 */\ +/* sqr(t[0], t[0]); */ /* 138: 1a0111ea397fe69a4b1ba7b64340 */\ +/* sqr(t[0], t[0]); */ /* 139: 340223d472ffcd3496374f6c8680 */\ +sqr_n_mul(t[0], t[0], 7, t[11]); /* 140: 340223d472ffcd3496374f6c8697 */\ +/* sqr(t[0], t[0]); */ /* 141: 680447a8e5ff9a692c6e9ed90d2e */\ +/* sqr(t[0], t[0]); */ /* 142: d0088f51cbff34d258dd3db21a5c */\ +/* sqr(t[0], t[0]); */ /* 143: 1a0111ea397fe69a4b1ba7b6434b8 */\ +/* sqr(t[0], t[0]); */ /* 144: 340223d472ffcd3496374f6c86970 */\ +/* sqr(t[0], t[0]); */ /* 145: 680447a8e5ff9a692c6e9ed90d2e0 */\ +sqr_n_mul(t[0], t[0], 5, t[12]); /* 146: 680447a8e5ff9a692c6e9ed90d2eb */\ +/* sqr(t[0], t[0]); */ /* 147: d0088f51cbff34d258dd3db21a5d6 */\ +/* sqr(t[0], t[0]); */ /* 148: 1a0111ea397fe69a4b1ba7b6434bac */\ +/* sqr(t[0], t[0]); */ /* 149: 340223d472ffcd3496374f6c869758 */\ +/* sqr(t[0], t[0]); */ /* 150: 680447a8e5ff9a692c6e9ed90d2eb0 */\ +/* sqr(t[0], t[0]); */ /* 151: d0088f51cbff34d258dd3db21a5d60 */\ +/* sqr(t[0], t[0]); */ /* 152: 1a0111ea397fe69a4b1ba7b6434bac0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 153: 1a0111ea397fe69a4b1ba7b6434bacd */\ +/* sqr(t[0], t[0]); */ /* 154: 340223d472ffcd3496374f6c869759a */\ +/* sqr(t[0], t[0]); */ /* 155: 680447a8e5ff9a692c6e9ed90d2eb34 */\ +/* sqr(t[0], t[0]); */ /* 156: d0088f51cbff34d258dd3db21a5d668 */\ +/* sqr(t[0], t[0]); */ /* 157: 1a0111ea397fe69a4b1ba7b6434bacd0 */\ +/* sqr(t[0], t[0]); */ /* 158: 340223d472ffcd3496374f6c869759a0 */\ +/* sqr(t[0], t[0]); */ /* 159: 680447a8e5ff9a692c6e9ed90d2eb340 */\ +sqr_n_mul(t[0], t[0], 6, t[5]); /* 160: 680447a8e5ff9a692c6e9ed90d2eb35d */\ +/* sqr(t[0], t[0]); */ /* 161: d0088f51cbff34d258dd3db21a5d66ba */\ +/* sqr(t[0], t[0]); */ /* 162: 1a0111ea397fe69a4b1ba7b6434bacd74 */\ +/* sqr(t[0], t[0]); */ /* 163: 340223d472ffcd3496374f6c869759ae8 */\ +/* sqr(t[0], t[0]); */ /* 164: 680447a8e5ff9a692c6e9ed90d2eb35d0 */\ +sqr_n_mul(t[0], t[0], 4, t[9]); /* 165: 680447a8e5ff9a692c6e9ed90d2eb35d9 */\ +/* sqr(t[0], t[0]); */ /* 166: d0088f51cbff34d258dd3db21a5d66bb2 */\ +/* sqr(t[0], t[0]); */ /* 167: 1a0111ea397fe69a4b1ba7b6434bacd764 */\ +/* sqr(t[0], t[0]); */ /* 168: 340223d472ffcd3496374f6c869759aec8 */\ +/* sqr(t[0], t[0]); */ /* 169: 680447a8e5ff9a692c6e9ed90d2eb35d90 */\ +/* sqr(t[0], t[0]); */ /* 170: d0088f51cbff34d258dd3db21a5d66bb20 */\ +/* sqr(t[0], t[0]); */ /* 171: 1a0111ea397fe69a4b1ba7b6434bacd7640 */\ +/* sqr(t[0], t[0]); */ /* 172: 340223d472ffcd3496374f6c869759aec80 */\ +/* sqr(t[0], t[0]); */ /* 173: 680447a8e5ff9a692c6e9ed90d2eb35d900 */\ +sqr_n_mul(t[0], t[0], 8, t[5]); /* 174: 680447a8e5ff9a692c6e9ed90d2eb35d91d */\ +/* sqr(t[0], t[0]); */ /* 175: d0088f51cbff34d258dd3db21a5d66bb23a */\ +/* sqr(t[0], t[0]); */ /* 176: 1a0111ea397fe69a4b1ba7b6434bacd76474 */\ +/* sqr(t[0], t[0]); */ /* 177: 340223d472ffcd3496374f6c869759aec8e8 */\ +/* sqr(t[0], t[0]); */ /* 178: 680447a8e5ff9a692c6e9ed90d2eb35d91d0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 179: 680447a8e5ff9a692c6e9ed90d2eb35d91dd */\ +/* sqr(t[0], t[0]); */ /* 180: 
d0088f51cbff34d258dd3db21a5d66bb23ba */\ +/* sqr(t[0], t[0]); */ /* 181: 1a0111ea397fe69a4b1ba7b6434bacd764774 */\ +/* sqr(t[0], t[0]); */ /* 182: 340223d472ffcd3496374f6c869759aec8ee8 */\ +/* sqr(t[0], t[0]); */ /* 183: 680447a8e5ff9a692c6e9ed90d2eb35d91dd0 */\ +/* sqr(t[0], t[0]); */ /* 184: d0088f51cbff34d258dd3db21a5d66bb23ba0 */\ +/* sqr(t[0], t[0]); */ /* 185: 1a0111ea397fe69a4b1ba7b6434bacd7647740 */\ +/* sqr(t[0], t[0]); */ /* 186: 340223d472ffcd3496374f6c869759aec8ee80 */\ +sqr_n_mul(t[0], t[0], 7, t[11]); /* 187: 340223d472ffcd3496374f6c869759aec8ee97 */\ +/* sqr(t[0], t[0]); */ /* 188: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e */\ +/* sqr(t[0], t[0]); */ /* 189: d0088f51cbff34d258dd3db21a5d66bb23ba5c */\ +/* sqr(t[0], t[0]); */ /* 190: 1a0111ea397fe69a4b1ba7b6434bacd764774b8 */\ +/* sqr(t[0], t[0]); */ /* 191: 340223d472ffcd3496374f6c869759aec8ee970 */\ +/* sqr(t[0], t[0]); */ /* 192: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e0 */\ +/* sqr(t[0], t[0]); */ /* 193: d0088f51cbff34d258dd3db21a5d66bb23ba5c0 */\ +/* sqr(t[0], t[0]); */ /* 194: 1a0111ea397fe69a4b1ba7b6434bacd764774b80 */\ +/* sqr(t[0], t[0]); */ /* 195: 340223d472ffcd3496374f6c869759aec8ee9700 */\ +/* sqr(t[0], t[0]); */ /* 196: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e00 */\ +sqr_n_mul(t[0], t[0], 9, t[10]); /* 197: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13 */\ +/* sqr(t[0], t[0]); */ /* 198: d0088f51cbff34d258dd3db21a5d66bb23ba5c26 */\ +/* sqr(t[0], t[0]); */ /* 199: 1a0111ea397fe69a4b1ba7b6434bacd764774b84c */\ +sqr_n_mul(t[0], t[0], 2, t[8]); /* 200: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f */\ +/* sqr(t[0], t[0]); */ /* 201: 340223d472ffcd3496374f6c869759aec8ee9709e */\ +/* sqr(t[0], t[0]); */ /* 202: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13c */\ +/* sqr(t[0], t[0]); */ /* 203: d0088f51cbff34d258dd3db21a5d66bb23ba5c278 */\ +/* sqr(t[0], t[0]); */ /* 204: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f0 */\ +/* sqr(t[0], t[0]); */ /* 205: 340223d472ffcd3496374f6c869759aec8ee9709e0 */\ +sqr_n_mul(t[0], t[0], 5, t[6]); /* 206: 340223d472ffcd3496374f6c869759aec8ee9709e7 */\ +/* sqr(t[0], t[0]); */ /* 207: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce */\ +/* sqr(t[0], t[0]); */ /* 208: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c */\ +/* sqr(t[0], t[0]); */ /* 209: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38 */\ +/* sqr(t[0], t[0]); */ /* 210: 340223d472ffcd3496374f6c869759aec8ee9709e70 */\ +/* sqr(t[0], t[0]); */ /* 211: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce0 */\ +/* sqr(t[0], t[0]); */ /* 212: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c0 */\ +/* sqr(t[0], t[0]); */ /* 213: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f380 */\ +sqr_n_mul(t[0], t[0], 7, t[1]); /* 214: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385 */\ +/* sqr(t[0], t[0]); */ /* 215: 340223d472ffcd3496374f6c869759aec8ee9709e70a */\ +/* sqr(t[0], t[0]); */ /* 216: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce14 */\ +/* sqr(t[0], t[0]); */ /* 217: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c28 */\ +/* sqr(t[0], t[0]); */ /* 218: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f3850 */\ +/* sqr(t[0], t[0]); */ /* 219: 340223d472ffcd3496374f6c869759aec8ee9709e70a0 */\ +/* sqr(t[0], t[0]); */ /* 220: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce140 */\ +/* sqr(t[0], t[0]); */ /* 221: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c280 */\ +sqr_n_mul(t[0], t[0], 7, t[9]); /* 222: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c289 */\ +/* sqr(t[0], t[0]); */ /* 223: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512 */\ +/* sqr(t[0], t[0]); */ /* 224: 340223d472ffcd3496374f6c869759aec8ee9709e70a24 */\ +/* 
sqr(t[0], t[0]); */ /* 225: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce1448 */\ +/* sqr(t[0], t[0]); */ /* 226: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2890 */\ +/* sqr(t[0], t[0]); */ /* 227: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385120 */\ +/* sqr(t[0], t[0]); */ /* 228: 340223d472ffcd3496374f6c869759aec8ee9709e70a240 */\ +sqr_n_mul(t[0], t[0], 6, t[11]); /* 229: 340223d472ffcd3496374f6c869759aec8ee9709e70a257 */\ +/* sqr(t[0], t[0]); */ /* 230: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae */\ +/* sqr(t[0], t[0]); */ /* 231: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895c */\ +/* sqr(t[0], t[0]); */ /* 232: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512b8 */\ +/* sqr(t[0], t[0]); */ /* 233: 340223d472ffcd3496374f6c869759aec8ee9709e70a2570 */\ +/* sqr(t[0], t[0]); */ /* 234: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 235: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd */\ +/* sqr(t[0], t[0]); */ /* 236: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa */\ +/* sqr(t[0], t[0]); */ /* 237: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf4 */\ +/* sqr(t[0], t[0]); */ /* 238: 340223d472ffcd3496374f6c869759aec8ee9709e70a257e8 */\ +/* sqr(t[0], t[0]); */ /* 239: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd0 */\ +/* sqr(t[0], t[0]); */ /* 240: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[10]); /* 241: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3 */\ +/* sqr(t[0], t[0]); */ /* 242: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf66 */\ +/* sqr(t[0], t[0]); */ /* 243: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ecc */\ +/* sqr(t[0], t[0]); */ /* 244: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd98 */\ +/* sqr(t[0], t[0]); */ /* 245: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb30 */\ +/* sqr(t[0], t[0]); */ /* 246: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf660 */\ +sqr_n_mul(t[0], t[0], 5, t[10]); /* 247: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf673 */\ +/* sqr(t[0], t[0]); */ /* 248: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece6 */\ +/* sqr(t[0], t[0]); */ /* 249: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc */\ +/* sqr(t[0], t[0]); */ /* 250: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398 */\ +/* sqr(t[0], t[0]); */ /* 251: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730 */\ +/* sqr(t[0], t[0]); */ /* 252: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece60 */\ +/* sqr(t[0], t[0]); */ /* 253: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc0 */\ +/* sqr(t[0], t[0]); */ /* 254: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3980 */\ +/* sqr(t[0], t[0]); */ /* 255: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf67300 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 256: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d */\ +/* sqr(t[0], t[0]); */ /* 257: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a */\ +/* sqr(t[0], t[0]); */ /* 258: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34 */\ +/* sqr(t[0], t[0]); */ /* 259: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39868 */\ +/* sqr(t[0], t[0]); */ /* 260: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d0 */\ +/* sqr(t[0], t[0]); */ /* 261: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a0 */\ +/* sqr(t[0], t[0]); */ /* 262: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc340 */\ +/* sqr(t[0], t[0]); */ /* 263: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398680 */\ +sqr_n_mul(t[0], t[0], 7, t[2]); /* 264: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695 */\ +/* 
sqr(t[0], t[0]); */ /* 265: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a */\ +/* sqr(t[0], t[0]); */ /* 266: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a54 */\ +/* sqr(t[0], t[0]); */ /* 267: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a8 */\ +/* sqr(t[0], t[0]); */ /* 268: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3986950 */\ +/* sqr(t[0], t[0]); */ /* 269: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0 */\ +/* sqr(t[0], t[0]); */ /* 270: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a540 */\ +/* sqr(t[0], t[0]); */ /* 271: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a80 */\ +/* sqr(t[0], t[0]); */ /* 272: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869500 */\ +/* sqr(t[0], t[0]); */ /* 273: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a00 */\ +sqr_n_mul(t[0], t[0], 9, t[7]); /* 274: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f */\ +/* sqr(t[0], t[0]); */ /* 275: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e */\ +/* sqr(t[0], t[0]); */ /* 276: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83c */\ +/* sqr(t[0], t[0]); */ /* 277: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695078 */\ +/* sqr(t[0], t[0]); */ /* 278: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f0 */\ +/* sqr(t[0], t[0]); */ /* 279: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e0 */\ +sqr_n_mul(t[0], t[0], 5, t[3]); /* 280: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed */\ +/* sqr(t[0], t[0]); */ /* 281: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83da */\ +/* sqr(t[0], t[0]); */ /* 282: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b4 */\ +/* sqr(t[0], t[0]); */ /* 283: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f68 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 284: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b */\ +/* sqr(t[0], t[0]); */ /* 285: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed6 */\ +/* sqr(t[0], t[0]); */ /* 286: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac */\ +/* sqr(t[0], t[0]); */ /* 287: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b58 */\ +/* sqr(t[0], t[0]); */ /* 288: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0 */\ +/* sqr(t[0], t[0]); */ /* 289: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed60 */\ +/* sqr(t[0], t[0]); */ /* 290: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac0 */\ +/* sqr(t[0], t[0]); */ /* 291: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b580 */\ +/* sqr(t[0], t[0]); */ /* 292: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b00 */\ +sqr_n_mul(t[0], t[0], 8, t[7]); /* 293: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f */\ +/* sqr(t[0], t[0]); */ /* 294: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61e */\ +/* sqr(t[0], t[0]); */ /* 295: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3c */\ +/* sqr(t[0], t[0]); */ /* 296: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b5878 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 297: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b */\ +/* sqr(t[0], t[0]); */ /* 298: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6 */\ +/* sqr(t[0], t[0]); */ /* 299: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec */\ +/* sqr(t[0], t[0]); */ /* 300: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8 */\ +/* sqr(t[0], t[0]); */ /* 301: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b0 */\ +/* sqr(t[0], t[0]); */ /* 302: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f60 */\ +/* sqr(t[0], t[0]); */ /* 303: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec0 */\ +/* sqr(t[0], t[0]); */ /* 304: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d80 */\ +sqr_n_mul(t[0], t[0], 7, t[9]); /* 305: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89 */\ +/* sqr(t[0], t[0]); */ /* 306: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b12 */\ +/* sqr(t[0], t[0]); */ /* 307: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624 */\ +/* sqr(t[0], t[0]); */ /* 308: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec48 */\ +/* sqr(t[0], t[0]); */ /* 309: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d890 */\ +/* sqr(t[0], t[0]); */ /* 310: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120 */\ +/* sqr(t[0], t[0]); */ /* 311: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6240 */\ +/* sqr(t[0], t[0]); */ /* 312: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec480 */\ +/* sqr(t[0], t[0]); */ /* 313: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8900 */\ +/* sqr(t[0], t[0]); */ /* 314: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b1200 */\ +sqr_n_mul(t[0], t[0], 9, t[7]); /* 315: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f */\ +/* sqr(t[0], t[0]); */ /* 316: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e */\ +/* sqr(t[0], t[0]); */ /* 317: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c */\ +/* sqr(t[0], t[0]); */ /* 318: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89078 */\ +/* sqr(t[0], t[0]); */ /* 319: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f0 */\ +/* sqr(t[0], t[0]); */ /* 320: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e0 */\ +/* sqr(t[0], t[0]); */ /* 321: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c0 */\ +sqr_n_mul(t[0], t[0], 6, t[2]); /* 322: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d5 */\ +/* sqr(t[0], t[0]); */ /* 323: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa */\ +/* sqr(t[0], t[0]); */ /* 324: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f54 */\ +/* sqr(t[0], t[0]); */ /* 325: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241ea8 */\ +/* sqr(t[0], t[0]); */ /* 326: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d50 */\ +/* sqr(t[0], t[0]); */ /* 327: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa0 */\ +/* sqr(t[0], t[0]); */ /* 328: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f540 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 329: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f */\ +/* sqr(t[0], t[0]); */ /* 330: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe */\ +/* sqr(t[0], t[0]); */ /* 331: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57c */\ +/* sqr(t[0], t[0]); */ /* 332: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaf8 */\ +/* sqr(t[0], t[0]); */ /* 333: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f0 */\ +/* sqr(t[0], t[0]); */ /* 334: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 335: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff */\ +/* sqr(t[0], t[0]); */ /* 336: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe */\ +/* sqr(t[0], t[0]); */ /* 337: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffc */\ +/* sqr(t[0], t[0]); */ /* 338: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ff8 */\ +/* sqr(t[0], t[0]); */ /* 339: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff0 */\ +/* sqr(t[0], t[0]); */ /* 340: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 341: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff */\ +/* sqr(t[0], t[0]); */ /* 342: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aafffe */\ +/* sqr(t[0], t[0]); */ /* 343: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55fffc */\ +/* sqr(t[0], t[0]); */ /* 344: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfff8 */\ +/* sqr(t[0], t[0]); */ /* 345: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 346: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd */\ +/* sqr(t[0], t[0]); */ /* 347: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffa */\ +/* sqr(t[0], t[0]); */ /* 348: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff4 */\ +/* sqr(t[0], t[0]); */ /* 349: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffe8 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 350: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb */\ +/* sqr(t[0], t[0]); */ /* 351: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd6 */\ +/* sqr(t[0], t[0]); */ /* 352: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac */\ +/* sqr(t[0], t[0]); */ /* 353: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58 */\ +/* sqr(t[0], t[0]); */ /* 354: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb0 */\ +/* sqr(t[0], t[0]); */ /* 355: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd60 */\ +/* sqr(t[0], t[0]); */ /* 356: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac0 */\ +/* sqr(t[0], t[0]); */ /* 357: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff580 */\ +/* sqr(t[0], t[0]); */ /* 358: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb00 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 359: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb15 */\ +/* sqr(t[0], t[0]); */ /* 360: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a */\ +/* sqr(t[0], t[0]); */ /* 361: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54 */\ +/* sqr(t[0], t[0]); */ /* 362: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a8 */\ +/* sqr(t[0], t[0]); */ /* 363: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb150 */\ +/* sqr(t[0], t[0]); */ /* 364: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a0 */\ +/* sqr(t[0], t[0]); */ /* 365: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac540 */\ +/* sqr(t[0], t[0]); */ /* 366: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a80 */\ +sqr_n_mul(t[0], t[0], 7, 
t[4]); /* 367: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f */\ +/* sqr(t[0], t[0]); */ /* 368: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e */\ +/* sqr(t[0], t[0]); */ /* 369: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7c */\ +/* sqr(t[0], t[0]); */ /* 370: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54f8 */\ +/* sqr(t[0], t[0]); */ /* 371: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f0 */\ +/* sqr(t[0], t[0]); */ /* 372: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 373: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff */\ +/* sqr(t[0], t[0]); */ /* 374: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe */\ +/* sqr(t[0], t[0]); */ /* 375: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffc */\ +/* sqr(t[0], t[0]); */ /* 376: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ff8 */\ +/* sqr(t[0], t[0]); */ /* 377: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff0 */\ +/* sqr(t[0], t[0]); */ /* 378: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 379: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff */\ +/* sqr(t[0], t[0]); */ /* 380: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54fffe */\ +/* sqr(t[0], t[0]); */ /* 381: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9fffc */\ +/* sqr(t[0], t[0]); */ /* 382: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153fff8 */\ +/* sqr(t[0], t[0]); */ /* 383: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[7]); /* 384: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff */\ +/* sqr(t[0], t[0]); */ /* 385: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffe */\ +/* sqr(t[0], t[0]); */ /* 386: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffc */\ +/* sqr(t[0], t[0]); */ /* 387: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffff8 */\ +/* sqr(t[0], t[0]); */ /* 388: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[6]); /* 389: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff7 */\ +/* sqr(t[0], t[0]); */ /* 390: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee */\ +/* sqr(t[0], t[0]); */ /* 391: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc */\ +/* sqr(t[0], t[0]); */ /* 392: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb8 */\ +/* sqr(t[0], t[0]); */ /* 393: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff70 */\ +/* sqr(t[0], t[0]); */ /* 394: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee0 */\ +/* sqr(t[0], t[0]); */ /* 395: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc0 */\ +/* sqr(t[0], t[0]); */ /* 396: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb80 */\ +sqr_n_mul(t[0], t[0], 7, t[4]); /* 397: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f */\ +/* sqr(t[0], t[0]); */ /* 398: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e */\ +/* sqr(t[0], t[0]); */ /* 399: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7c */\ +/* sqr(t[0], t[0]); */ /* 400: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcf8 */\ +/* sqr(t[0], t[0]); */ /* 401: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f0 */\ +/* sqr(t[0], t[0]); */ /* 402: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 403: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd */\ +/* sqr(t[0], t[0]); */ /* 404: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa */\ +/* sqr(t[0], t[0]); */ /* 405: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff4 */\ +/* sqr(t[0], t[0]); */ /* 406: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fe8 */\ +/* sqr(t[0], t[0]); */ /* 407: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd0 */\ +/* sqr(t[0], t[0]); */ /* 408: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 409: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf */\ +/* sqr(t[0], t[0]); */ /* 410: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e */\ +/* sqr(t[0], t[0]); */ /* 411: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefc */\ +/* sqr(t[0], t[0]); */ /* 412: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdf8 */\ +/* sqr(t[0], t[0]); */ /* 413: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf0 */\ +/* sqr(t[0], t[0]); */ /* 414: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 415: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff */\ +/* sqr(t[0], t[0]); */ /* 416: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe */\ +/* sqr(t[0], t[0]); */ /* 417: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffc */\ +/* sqr(t[0], t[0]); */ /* 418: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbff8 */\ +/* sqr(t[0], t[0]); */ /* 419: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff0 */\ +/* sqr(t[0], t[0]); */ /* 420: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 421: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff */\ +/* sqr(t[0], t[0]); */ /* 422: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe */\ +/* sqr(t[0], t[0]); */ /* 423: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffc */\ +/* sqr(t[0], t[0]); */ /* 424: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fff8 */\ +/* sqr(t[0], t[0]); */ /* 425: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff0 */\ +/* sqr(t[0], t[0]); */ /* 426: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 427: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff */\ +/* sqr(t[0], t[0]); */ /* 428: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe */\ +/* sqr(t[0], t[0]); */ /* 429: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ffffc */\ +/* sqr(t[0], t[0]); */ /* 430: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffff8 */\ +/* sqr(t[0], t[0]); */ /* 431: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff0 */\ +/* sqr(t[0], t[0]); */ /* 432: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 433: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff */\ +/* sqr(t[0], t[0]); */ /* 434: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe */\ +/* sqr(t[0], t[0]); */ /* 435: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffc */\ +/* sqr(t[0], t[0]); */ /* 436: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffff8 */\ +/* sqr(t[0], t[0]); */ /* 437: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff0 */\ +/* sqr(t[0], t[0]); */ /* 438: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 439: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff */\ +/* sqr(t[0], t[0]); */ /* 440: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffffffe */\ +/* sqr(t[0], t[0]); */ /* 441: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffffffc */\ +/* sqr(t[0], t[0]); */ /* 442: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffff8 */\ +/* sqr(t[0], t[0]); */ /* 443: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 444: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd */\ +/* sqr(t[0], t[0]); */ /* 445: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa */\ +/* sqr(t[0], t[0]); */ /* 446: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff4 */\ +/* sqr(t[0], t[0]); */ /* 447: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffe8 */\ +/* sqr(t[0], t[0]); */ /* 448: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd0 */\ +/* sqr(t[0], t[0]); */ /* 449: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa0 */\ +/* sqr(t[0], t[0]); */ /* 450: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff40 */\ +sqr_n_mul(t[0], t[0], 6, t[2]); /* 451: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff55 */\ +/* sqr(t[0], t[0]); */ /* 452: 
680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaa */\ +/* sqr(t[0], t[0]); */ /* 453: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd54 */\ +/* sqr(t[0], t[0]); */ /* 454: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaa8 */\ +/* sqr(t[0], t[0]); */ /* 455: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff550 */\ +sqr_n_mul(t[0], t[0], 4, t[1]); /* 456: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff555 */\ +sqr(out, t[0]); /* 457: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaaa */\ +} while(0) diff --git a/crypto/blst_src/sqrt.c b/crypto/blst_src/sqrt.c new file mode 100644 index 00000000000..cf149fd1124 --- /dev/null +++ b/crypto/blst_src/sqrt.c @@ -0,0 +1,261 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" + +#ifdef __OPTIMIZE_SIZE__ +static void recip_sqrt_fp_3mod4(vec384 out, const vec384 inp) +{ + static const byte BLS_12_381_P_minus_3_div_4[] = { + TO_BYTES(0xee7fbfffffffeaaa), TO_BYTES(0x07aaffffac54ffff), + TO_BYTES(0xd9cc34a83dac3d89), TO_BYTES(0xd91dd2e13ce144af), + TO_BYTES(0x92c6e9ed90d2eb35), TO_BYTES(0x0680447a8e5ff9a6) + }; + + exp_mont_384(out, inp, BLS_12_381_P_minus_3_div_4, 379, BLS12_381_P, p0); +} +#else +# if 1 +/* + * "383"-bit variant omits full reductions at the ends of squarings, + * which results in up to ~15% improvement. [One can improve further + * by omitting full reductions even after multiplications and + * performing final reduction at the very end of the chain.] 
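The chain above is a fixed addition-chain exponentiation: it evaluates inp^((p-3)/4) for the BLS12-381 prime p (the same exponent the size-optimized recip_sqrt_fp_3mod4 above passes to exp_mont_384 as a 379-bit byte string), and each sqr_n_mul(t[0], t[0], n, t[k]) step squares the accumulator n times and then multiplies once by a precomputed table entry (t[1] through t[7] in the visible tail). As a rough, hypothetical sketch of that pattern only, over a toy 64-bit modulus rather than the blst field:

#include <stdint.h>
#include <stdio.h>

/* a 64-bit prime, 2^64 - 2^32 + 1, small enough for __int128 intermediates */
static const uint64_t P = 0xffffffff00000001ULL;

static uint64_t mul_mod(uint64_t a, uint64_t b)
{   return (uint64_t)((unsigned __int128)a * b % P);   }

/* square |a| n times, then multiply once by |b| -- the shape of sqr_n_mul() */
static uint64_t toy_sqr_n_mul(uint64_t a, size_t n, uint64_t b)
{
    while (n--)
        a = mul_mod(a, a);
    return mul_mod(a, b);
}

/* straightforward square-and-multiply, used only as a cross-check */
static uint64_t pow_mod(uint64_t a, uint64_t e)
{
    uint64_t r = 1;
    for (; e != 0; e >>= 1, a = mul_mod(a, a))
        if (e & 1)
            r = mul_mod(r, a);
    return r;
}

int main(void)
{
    uint64_t x = 5;
    uint64_t t = toy_sqr_n_mul(x, 1, x);    /* x^2  * x = x^3  */
    t = toy_sqr_n_mul(t, 2, x);             /* x^12 * x = x^13 */

    printf("chain %016llx, direct x^13 %016llx\n",
           (unsigned long long)t, (unsigned long long)pow_mod(x, 13));
    return 0;
}

The real chain differs only in scale: the operands are 384-bit Montgomery-form field elements, the squarings use the "383-bit" lazy-reduction variant described in the comment above, and the exponent bits are baked into the fixed sequence of (n, t[k]) pairs instead of being scanned at run time.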
+ */ +static inline void sqr_n_mul_fp(vec384 out, const vec384 a, size_t count, + const vec384 b) +{ sqr_n_mul_mont_383(out, a, count, BLS12_381_P, p0, b); } +# else +static void sqr_n_mul_fp(vec384 out, const vec384 a, size_t count, + const vec384 b) +{ + while(count--) { + sqr_fp(out, a); + a = out; + } + mul_fp(out, out, b); +} +# endif + +# define sqr(ret,a) sqr_fp(ret,a) +# define mul(ret,a,b) mul_fp(ret,a,b) +# define sqr_n_mul(ret,a,n,b) sqr_n_mul_fp(ret,a,n,b) + +# include "sqrt-addchain.h" +static void recip_sqrt_fp_3mod4(vec384 out, const vec384 inp) +{ + RECIP_SQRT_MOD_BLS12_381_P(out, inp, vec384); +} +# undef RECIP_SQRT_MOD_BLS12_381_P + +# undef sqr_n_mul +# undef sqr +# undef mul +#endif + +static bool_t recip_sqrt_fp(vec384 out, const vec384 inp) +{ + vec384 t0, t1; + bool_t ret; + + recip_sqrt_fp_3mod4(t0, inp); + + mul_fp(t1, t0, inp); + sqr_fp(t1, t1); + ret = vec_is_equal(t1, inp, sizeof(t1)); + vec_copy(out, t0, sizeof(t0)); + + return ret; +} + +static bool_t sqrt_fp(vec384 out, const vec384 inp) +{ + vec384 t0, t1; + bool_t ret; + + recip_sqrt_fp_3mod4(t0, inp); + + mul_fp(t0, t0, inp); + sqr_fp(t1, t0); + ret = vec_is_equal(t1, inp, sizeof(t1)); + vec_copy(out, t0, sizeof(t0)); + + return ret; +} + +int blst_fp_sqrt(vec384 out, const vec384 inp) +{ return (int)sqrt_fp(out, inp); } + +int blst_fp_is_square(const vec384 inp) +{ + return (int)ct_is_square_mod_384(inp, BLS12_381_P); +} + +static bool_t sqrt_align_fp2(vec384x out, const vec384x ret, + const vec384x sqrt, const vec384x inp) +{ + static const vec384x sqrt_minus_1 = { { 0 }, { ONE_MONT_P } }; + static const vec384x sqrt_sqrt_minus_1 = { + /* + * "magic" number is ±2^((p-3)/4)%p, which is "1/sqrt(2)", + * in quotes because 2*"1/sqrt(2)"^2 == -1 mod p, not 1, + * but it pivots into "complex" plane nevertheless... + */ + { TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), + TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), + TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } + }; + static const vec384x sqrt_minus_sqrt_minus_1 = { + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) }, + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } + }; + vec384x coeff, t0, t1; + bool_t is_sqrt, flag; + + /* + * Instead of multiple trial squarings we can perform just one + * and see if the result is "rotated by multiple of 90°" in + * relation to |inp|, and "rotate" |ret| accordingly. + */ + sqr_fp2(t0, sqrt); + /* "sqrt(|inp|)"^2 = (a + b*i)^2 = (a^2-b^2) + 2ab*i */ + + /* (a^2-b^2) + 2ab*i == |inp| ? |ret| is spot on */ + sub_fp2(t1, t0, inp); + is_sqrt = vec_is_zero(t1, sizeof(t1)); + vec_copy(coeff, BLS12_381_Rx.p2, sizeof(coeff)); + + /* -(a^2-b^2) - 2ab*i == |inp| ? "rotate |ret| by 90°" */ + add_fp2(t1, t0, inp); + vec_select(coeff, sqrt_minus_1, coeff, sizeof(coeff), + flag = vec_is_zero(t1, sizeof(t1))); + is_sqrt |= flag; + + /* 2ab - (a^2-b^2)*i == |inp| ? 
"rotate |ret| by 135°" */ + sub_fp(t1[0], t0[0], inp[1]); + add_fp(t1[1], t0[1], inp[0]); + vec_select(coeff, sqrt_sqrt_minus_1, coeff, sizeof(coeff), + flag = vec_is_zero(t1, sizeof(t1))); + is_sqrt |= flag; + + /* -2ab + (a^2-b^2)*i == |inp| ? "rotate |ret| by 45°" */ + add_fp(t1[0], t0[0], inp[1]); + sub_fp(t1[1], t0[1], inp[0]); + vec_select(coeff, sqrt_minus_sqrt_minus_1, coeff, sizeof(coeff), + flag = vec_is_zero(t1, sizeof(t1))); + is_sqrt |= flag; + + /* actual "rotation" */ + mul_fp2(out, ret, coeff); + + return is_sqrt; +} + +/* + * |inp| = a + b*i + */ +static bool_t recip_sqrt_fp2(vec384x out, const vec384x inp, + const vec384x recip_ZZZ, + const vec384x magic_ZZZ) +{ + vec384 aa, bb, cc; + vec384x inp_; + bool_t is_sqrt; + + sqr_fp(aa, inp[0]); + sqr_fp(bb, inp[1]); + add_fp(aa, aa, bb); + + is_sqrt = recip_sqrt_fp(cc, aa); /* 1/sqrt(a²+b²) */ + + /* if |inp| doesn't have quadratic residue, multiply by "1/Z³" ... */ + mul_fp2(inp_, inp, recip_ZZZ); + /* ... and adjust |aa| and |cc| accordingly */ + { + vec384 za, zc; + + mul_fp(za, aa, magic_ZZZ[0]); /* aa*(za² + zb²) */ + mul_fp(zc, cc, magic_ZZZ[1]); /* cc*(za² + zb²)^((p-3)/4) */ + vec_select(aa, aa, za, sizeof(aa), is_sqrt); + vec_select(cc, cc, zc, sizeof(cc), is_sqrt); + } + vec_select(inp_, inp, inp_, sizeof(inp_), is_sqrt); + + mul_fp(aa, aa, cc); /* sqrt(a²+b²) */ + + sub_fp(bb, inp_[0], aa); + add_fp(aa, inp_[0], aa); + vec_select(aa, bb, aa, sizeof(aa), vec_is_zero(aa, sizeof(aa))); + div_by_2_fp(aa, aa); /* (a ± sqrt(a²+b²))/2 */ + + /* if it says "no sqrt," final "align" will find right one... */ + (void)recip_sqrt_fp(out[0], aa); /* 1/sqrt((a ± sqrt(a²+b²))/2) */ + + div_by_2_fp(out[1], inp_[1]); + mul_fp(out[1], out[1], out[0]); /* b/(2*sqrt((a ± sqrt(a²+b²))/2)) */ + mul_fp(out[0], out[0], aa); /* sqrt((a ± sqrt(a²+b²))/2) */ + + /* bound to succeed */ + (void)sqrt_align_fp2(out, out, out, inp_); + + mul_fp(out[0], out[0], cc); /* inverse the result */ + mul_fp(out[1], out[1], cc); + neg_fp(out[1], out[1]); + + return is_sqrt; +} + +static bool_t sqrt_fp2(vec384x out, const vec384x inp) +{ + vec384x ret; + vec384 aa, bb; + + sqr_fp(aa, inp[0]); + sqr_fp(bb, inp[1]); + add_fp(aa, aa, bb); + + /* don't pay attention to return value, final "align" will tell... */ + (void)sqrt_fp(aa, aa); /* sqrt(a²+b²) */ + + sub_fp(bb, inp[0], aa); + add_fp(aa, inp[0], aa); + vec_select(aa, bb, aa, sizeof(aa), vec_is_zero(aa, sizeof(aa))); + div_by_2_fp(aa, aa); /* (a ± sqrt(a²+b²))/2 */ + + /* if it says "no sqrt," final "align" will find right one... */ + (void)recip_sqrt_fp(ret[0], aa); /* 1/sqrt((a ± sqrt(a²+b²))/2) */ + + div_by_2_fp(ret[1], inp[1]); + mul_fp(ret[1], ret[1], ret[0]); /* b/(2*sqrt((a ± sqrt(a²+b²))/2)) */ + mul_fp(ret[0], ret[0], aa); /* sqrt((a ± sqrt(a²+b²))/2) */ + + /* + * Now see if |ret| is or can be made sqrt(|inp|)... + */ + + return sqrt_align_fp2(out, ret, ret, inp); +} + +int blst_fp2_sqrt(vec384x out, const vec384x inp) +{ return (int)sqrt_fp2(out, inp); } + +int blst_fp2_is_square(const vec384x inp) +{ + vec384 aa, bb; + + sqr_fp(aa, inp[0]); + sqr_fp(bb, inp[1]); + add_fp(aa, aa, bb); + + return (int)ct_is_square_mod_384(aa, BLS12_381_P); +} diff --git a/crypto/blst_src/vect.c b/crypto/blst_src/vect.c new file mode 100644 index 00000000000..1834a48fadd --- /dev/null +++ b/crypto/blst_src/vect.c @@ -0,0 +1,176 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "vect.h" + +#ifdef __BLST_NO_ASM__ +# include "no_asm.h" +#endif + +/* + * Following are some reference C implementations to assist new + * assembly modules development, as starting-point stand-ins and for + * cross-checking. In order to "polyfil" specific subroutine redefine + * it on compiler command line, e.g. -Dmul_mont_384x=_mul_mont_384x. + */ + +#ifdef lshift_mod_384 +inline void lshift_mod_384(vec384 ret, const vec384 a, size_t n, + const vec384 mod) +{ + while(n--) + add_mod_384(ret, a, a, mod), a = ret; +} +#endif + +#ifdef mul_by_8_mod_384 +inline void mul_by_8_mod_384(vec384 ret, const vec384 a, const vec384 mod) +{ lshift_mod_384(ret, a, 3, mod); } +#endif + +#ifdef mul_by_3_mod_384 +inline void mul_by_3_mod_384(vec384 ret, const vec384 a, const vec384 mod) +{ + vec384 t; + + add_mod_384(t, a, a, mod); + add_mod_384(ret, t, a, mod); +} +#endif + +#ifdef mul_by_3_mod_384x +inline void mul_by_3_mod_384x(vec384x ret, const vec384x a, const vec384 mod) +{ + mul_by_3_mod_384(ret[0], a[0], mod); + mul_by_3_mod_384(ret[1], a[1], mod); +} +#endif + +#ifdef mul_by_8_mod_384x +inline void mul_by_8_mod_384x(vec384x ret, const vec384x a, const vec384 mod) +{ + mul_by_8_mod_384(ret[0], a[0], mod); + mul_by_8_mod_384(ret[1], a[1], mod); +} +#endif + +#ifdef mul_by_1_plus_i_mod_384x +inline void mul_by_1_plus_i_mod_384x(vec384x ret, const vec384x a, + const vec384 mod) +{ + vec384 t; + + add_mod_384(t, a[0], a[1], mod); + sub_mod_384(ret[0], a[0], a[1], mod); + vec_copy(ret[1], t, sizeof(t)); +} +#endif + +#ifdef add_mod_384x +inline void add_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 mod) +{ + add_mod_384(ret[0], a[0], b[0], mod); + add_mod_384(ret[1], a[1], b[1], mod); +} +#endif + +#ifdef sub_mod_384x +inline void sub_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 mod) +{ + sub_mod_384(ret[0], a[0], b[0], mod); + sub_mod_384(ret[1], a[1], b[1], mod); +} +#endif + +#ifdef lshift_mod_384x +inline void lshift_mod_384x(vec384x ret, const vec384x a, size_t n, + const vec384 mod) +{ + lshift_mod_384(ret[0], a[0], n, mod); + lshift_mod_384(ret[1], a[1], n, mod); +} +#endif + +#if defined(mul_mont_384x) && !(defined(__ADX__) && !defined(__BLST_PORTABLE__)) +void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 mod, limb_t n0) +{ + vec768 t0, t1, t2; + vec384 aa, bb; + + mul_384(t0, a[0], b[0]); + mul_384(t1, a[1], b[1]); + + add_mod_384(aa, a[0], a[1], mod); + add_mod_384(bb, b[0], b[1], mod); + mul_384(t2, aa, bb); + sub_mod_384x384(t2, t2, t0, mod); + sub_mod_384x384(t2, t2, t1, mod); + + sub_mod_384x384(t0, t0, t1, mod); + + redc_mont_384(ret[0], t0, mod, n0); + redc_mont_384(ret[1], t2, mod, n0); +} +#endif + +#if defined(sqr_mont_384x) && !(defined(__ADX__) && !defined(__BLST_PORTABLE__)) +void sqr_mont_384x(vec384x ret, const vec384x a, const vec384 mod, limb_t n0) +{ + vec384 t0, t1; + + add_mod_384(t0, a[0], a[1], mod); + sub_mod_384(t1, a[0], a[1], mod); + + mul_mont_384(ret[1], a[0], a[1], mod, n0); + add_mod_384(ret[1], ret[1], ret[1], mod); + + mul_mont_384(ret[0], t0, t1, mod, n0); +} +#endif + +limb_t div_3_limbs(const limb_t dividend_top[2], limb_t d_lo, limb_t d_hi); +limb_t quot_rem_128(limb_t *quot_rem, const limb_t *divisor, limb_t quotient); +limb_t quot_rem_64(limb_t *quot_rem, const limb_t *divisor, limb_t quotient); + +/* + * Divide 255-bit |val| by z^2 yielding 128-bit quotient and remainder in place. 
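For context on the hard-coded divisors used by div_by_zz() and div_by_z() just below: the single-limb divisor 0xd201000000010000 is the absolute value of the BLS12-381 curve parameter z, and the two-limb zz[] divisor is simply z squared, i.e. 0xac45a4010001a402_0000000100000000 split into little-endian limbs. A throwaway standalone check of that constant, purely illustrative and not part of blst:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t z = 0xd201000000010000ULL;              /* |z| for BLS12-381 */
    unsigned __int128 zz = (unsigned __int128)z * z;

    /* expect hi = ac45a4010001a402, lo = 0000000100000000 */
    printf("z^2 = %016llx %016llx\n",
           (unsigned long long)(zz >> 64), (unsigned long long)zz);
    return 0;
}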
+ */ +static void div_by_zz(limb_t val[]) +{ + static const limb_t zz[] = { TO_LIMB_T(0x0000000100000000), + TO_LIMB_T(0xac45a4010001a402) }; + size_t loop, zz_len = sizeof(zz)/sizeof(zz[0]); + limb_t d_lo, d_hi; + + d_lo = zz[zz_len - 2]; + d_hi = zz[zz_len - 1]; + for (loop = zz_len, zz_len--; loop--;) { + limb_t q = div_3_limbs(val + loop + zz_len, d_lo, d_hi); + (void)quot_rem_128(val + loop, zz, q); + } + /* remainder is in low half of val[], quotient is in high */ +} + +/* + * Divide 128-bit |val| by z yielding 64-bit quotient and remainder in place. + */ +static void div_by_z(limb_t val[]) +{ + static const limb_t z[] = { TO_LIMB_T(0xd201000000010000) }; + size_t loop, z_len = sizeof(z)/sizeof(z[0]); + limb_t d_lo, d_hi; + + d_lo = (sizeof(z) == sizeof(limb_t)) ? 0 : z[z_len - 2]; + d_hi = z[z_len - 1]; + for (loop = z_len, z_len--; loop--;) { + limb_t q = div_3_limbs(val + loop + z_len, d_lo, d_hi); + (void)quot_rem_64(val + loop, z, q); + } + /* remainder is in low half of val[], quotient is in high */ +} diff --git a/crypto/blst_src/vect.h b/crypto/blst_src/vect.h new file mode 100644 index 00000000000..3211c8628cf --- /dev/null +++ b/crypto/blst_src/vect.h @@ -0,0 +1,418 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_VECT_H__ +#define __BLS12_381_ASM_VECT_H__ + +#include + +#if defined(__x86_64__) || defined(__aarch64__) +/* These are available even in ILP32 flavours, but even then they are + * capable of performing 64-bit operations as efficiently as in *P64. */ +typedef unsigned long long limb_t; +# define LIMB_T_BITS 64 + +#elif defined(_WIN64) /* Win64 is P64 */ +typedef unsigned __int64 limb_t; +# define LIMB_T_BITS 64 + +#elif defined(__BLST_NO_ASM__) || defined(__wasm64__) +typedef unsigned int limb_t; +# define LIMB_T_BITS 32 +# ifndef __BLST_NO_ASM__ +# define __BLST_NO_ASM__ +# endif + +#else /* 32 bits on 32-bit platforms, 64 - on 64-bit */ +typedef unsigned long limb_t; +# ifdef _LP64 +# define LIMB_T_BITS 64 +# else +# define LIMB_T_BITS 32 +# define __BLST_NO_ASM__ +# endif +#endif + +/* + * Why isn't LIMB_T_BITS defined as 8*sizeof(limb_t)? Because pre-processor + * knows nothing about sizeof(anything)... + */ +#if LIMB_T_BITS == 64 +# define TO_LIMB_T(limb64) limb64 +#else +# define TO_LIMB_T(limb64) (limb_t)limb64,(limb_t)(limb64>>32) +#endif + +#define NLIMBS(bits) (bits/LIMB_T_BITS) + +typedef limb_t vec256[NLIMBS(256)]; +typedef limb_t vec512[NLIMBS(512)]; +typedef limb_t vec384[NLIMBS(384)]; +typedef limb_t vec768[NLIMBS(768)]; +typedef vec384 vec384x[2]; /* 0 is "real" part, 1 is "imaginary" */ + +typedef unsigned char byte; +#define TO_BYTES(limb64) (byte)limb64,(byte)(limb64>>8),\ + (byte)(limb64>>16),(byte)(limb64>>24),\ + (byte)(limb64>>32),(byte)(limb64>>40),\ + (byte)(limb64>>48),(byte)(limb64>>56) +typedef byte pow256[256/8]; + +/* + * Internal Boolean type, Bolean by value, hence safe to cast to or + * reinterpret as 'bool'. + */ +typedef limb_t bool_t; + +/* + * Assembly subroutines... + */ +#if defined(__ADX__) /* e.g. 
-march=broadwell */ && !defined(__BLST_PORTABLE__)\ + && !defined(__BLST_NO_ASM__) +# define mul_mont_sparse_256 mulx_mont_sparse_256 +# define sqr_mont_sparse_256 sqrx_mont_sparse_256 +# define from_mont_256 fromx_mont_256 +# define redc_mont_256 redcx_mont_256 +# define mul_mont_384 mulx_mont_384 +# define sqr_mont_384 sqrx_mont_384 +# define sqr_n_mul_mont_384 sqrx_n_mul_mont_384 +# define sqr_n_mul_mont_383 sqrx_n_mul_mont_383 +# define mul_384 mulx_384 +# define sqr_384 sqrx_384 +# define redc_mont_384 redcx_mont_384 +# define from_mont_384 fromx_mont_384 +# define sgn0_pty_mont_384 sgn0x_pty_mont_384 +# define sgn0_pty_mont_384x sgn0x_pty_mont_384x +# define ct_inverse_mod_383 ctx_inverse_mod_383 +#elif defined(__BLST_NO_ASM__) +# define ct_inverse_mod_383 ct_inverse_mod_384 +#endif + +void mul_mont_sparse_256(vec256 ret, const vec256 a, const vec256 b, + const vec256 p, limb_t n0); +void sqr_mont_sparse_256(vec256 ret, const vec256 a, const vec256 p, limb_t n0); +void redc_mont_256(vec256 ret, const vec512 a, const vec256 p, limb_t n0); +void from_mont_256(vec256 ret, const vec256 a, const vec256 p, limb_t n0); + +void add_mod_256(vec256 ret, const vec256 a, const vec256 b, const vec256 p); +void sub_mod_256(vec256 ret, const vec256 a, const vec256 b, const vec256 p); +void mul_by_3_mod_256(vec256 ret, const vec256 a, const vec256 p); +void cneg_mod_256(vec256 ret, const vec256 a, bool_t flag, const vec256 p); +void lshift_mod_256(vec256 ret, const vec256 a, size_t count, const vec256 p); +void rshift_mod_256(vec256 ret, const vec256 a, size_t count, const vec256 p); +bool_t eucl_inverse_mod_256(vec256 ret, const vec256 a, const vec256 p, + const vec256 one); +limb_t check_mod_256(const pow256 a, const vec256 p); +limb_t add_n_check_mod_256(pow256 ret, const pow256 a, const pow256 b, + const vec256 p); +limb_t sub_n_check_mod_256(pow256 ret, const pow256 a, const pow256 b, + const vec256 p); + +void vec_prefetch(const void *ptr, size_t len); + +void mul_mont_384(vec384 ret, const vec384 a, const vec384 b, + const vec384 p, limb_t n0); +void sqr_mont_384(vec384 ret, const vec384 a, const vec384 p, limb_t n0); +void sqr_n_mul_mont_384(vec384 ret, const vec384 a, size_t count, + const vec384 p, limb_t n0, const vec384 b); +void sqr_n_mul_mont_383(vec384 ret, const vec384 a, size_t count, + const vec384 p, limb_t n0, const vec384 b); + +void mul_384(vec768 ret, const vec384 a, const vec384 b); +void sqr_384(vec768 ret, const vec384 a); +void redc_mont_384(vec384 ret, const vec768 a, const vec384 p, limb_t n0); +void from_mont_384(vec384 ret, const vec384 a, const vec384 p, limb_t n0); +limb_t sgn0_pty_mont_384(const vec384 a, const vec384 p, limb_t n0); +limb_t sgn0_pty_mont_384x(const vec384x a, const vec384 p, limb_t n0); +limb_t sgn0_pty_mod_384(const vec384 a, const vec384 p); +limb_t sgn0_pty_mod_384x(const vec384x a, const vec384 p); + +void add_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p); +void sub_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p); +void mul_by_8_mod_384(vec384 ret, const vec384 a, const vec384 p); +void mul_by_3_mod_384(vec384 ret, const vec384 a, const vec384 p); +void cneg_mod_384(vec384 ret, const vec384 a, bool_t flag, const vec384 p); +void lshift_mod_384(vec384 ret, const vec384 a, size_t count, const vec384 p); +void rshift_mod_384(vec384 ret, const vec384 a, size_t count, const vec384 p); +void div_by_2_mod_384(vec384 ret, const vec384 a, const vec384 p); +void ct_inverse_mod_383(vec768 ret, const vec384 inp, 
const vec384 mod, + const vec384 modx); +void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod, + const vec256 modx); +bool_t ct_is_square_mod_384(const vec384 inp, const vec384 mod); + +#if defined(__ADX__) /* e.g. -march=broadwell */ && !defined(__BLST_PORTABLE__) +# define mul_mont_384x mulx_mont_384x +# define sqr_mont_384x sqrx_mont_384x +# define sqr_mont_382x sqrx_mont_382x +# define sqr_n_mul_mont_384x sqrx_n_mul_mont_384x +# define mul_382x mulx_382x +# define sqr_382x sqrx_382x +#endif + +void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p, limb_t n0); +void sqr_mont_384x(vec384x ret, const vec384x a, const vec384 p, limb_t n0); +void sqr_mont_382x(vec384x ret, const vec384x a, const vec384 p, limb_t n0); +void sqr_n_mul_mont_384x(vec384x ret, const vec384x a, size_t count, + const vec384 p, limb_t n0, const vec384x b); +void mul_382x(vec768 ret[2], const vec384x a, const vec384x b, const vec384 p); +void sqr_382x(vec768 ret[2], const vec384x a, const vec384 p); + +void add_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p); +void sub_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p); +void mul_by_8_mod_384x(vec384x ret, const vec384x a, const vec384 p); +void mul_by_3_mod_384x(vec384x ret, const vec384x a, const vec384 p); +void mul_by_1_plus_i_mod_384x(vec384x ret, const vec384x a, const vec384 p); +void add_mod_384x384(vec768 ret, const vec768 a, const vec768 b, + const vec384 p); +void sub_mod_384x384(vec768 ret, const vec768 a, const vec768 b, + const vec384 p); + +/* + * C subroutines + */ +static void exp_mont_384(vec384 out, const vec384 inp, const byte *pow, + size_t pow_bits, const vec384 p, limb_t n0); +static void exp_mont_384x(vec384x out, const vec384x inp, const byte *pow, + size_t pow_bits, const vec384 p, limb_t n0); +static void div_by_zz(limb_t val[]); +static void div_by_z(limb_t val[]); + +#ifdef __UINTPTR_TYPE__ +typedef __UINTPTR_TYPE__ uptr_t; +#else +typedef const void *uptr_t; +#endif + +#if !defined(restrict) +# if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 +# if defined(__GNUC__) && __GNUC__>=2 +# define restrict __restrict__ +# elif defined(_MSC_VER) +# define restrict __restrict +# else +# define restrict +# endif +# endif +#endif + +#if !defined(inline) && !defined(__cplusplus) +# if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 +# if defined(__GNUC__) && __GNUC__>=2 +# define inline __inline__ +# elif defined(_MSC_VER) +# define inline __inline +# else +# define inline +# endif +# endif +#endif + +#if defined(__GNUC__) || defined(__clang__) +# define launder(var) asm volatile("" : "+r"(var)) +#else +# define launder(var) +#endif + +static inline bool_t is_bit_set(const byte *v, size_t i) +{ + bool_t ret = (v[i/8] >> (i%8)) & 1; + launder(ret); + return ret; +} + +static inline bool_t byte_is_zero(unsigned char c) +{ + limb_t ret = ((limb_t)(c) - 1) >> (LIMB_T_BITS - 1); + launder(ret); + return ret; +} + +static inline bool_t bytes_are_zero(const unsigned char *a, size_t num) +{ + unsigned char acc; + size_t i; + + for (acc = 0, i = 0; i < num; i++) + acc |= a[i]; + + return byte_is_zero(acc); +} + +static inline void vec_cswap(void *restrict a, void *restrict b, size_t num, + bool_t cbit) +{ + limb_t ai, *ap = (limb_t *)a; + limb_t bi, *bp = (limb_t *)b; + limb_t xorm, mask = (limb_t)0 - cbit; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) { + xorm = ((ai = ap[i]) ^ (bi = bp[i])) & mask; + ap[i] = ai ^ xorm; + 
bp[i] = bi ^ xorm; + } +} + +/* ret = bit ? a : b */ +void vec_select_32(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_48(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_96(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_144(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_192(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_288(void *ret, const void *a, const void *b, bool_t sel_a); +static inline void vec_select(void *ret, const void *a, const void *b, + size_t num, bool_t sel_a) +{ + launder(sel_a); +#ifndef __BLST_NO_ASM__ + if (num == 32) vec_select_32(ret, a, b, sel_a); + else if (num == 48) vec_select_48(ret, a, b, sel_a); + else if (num == 96) vec_select_96(ret, a, b, sel_a); + else if (num == 144) vec_select_144(ret, a, b, sel_a); + else if (num == 192) vec_select_192(ret, a, b, sel_a); + else if (num == 288) vec_select_288(ret, a, b, sel_a); +#else + if (0) ; +#endif + else { + limb_t bi; + volatile limb_t *rp = (limb_t *)ret; + const limb_t *ap = (const limb_t *)a; + const limb_t *bp = (const limb_t *)b; + limb_t xorm, mask = (limb_t)0 - sel_a; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) { + xorm = (ap[i] ^ (bi = bp[i])) & mask; + rp[i] = bi ^ xorm; + } + } +} + +static inline bool_t is_zero(limb_t l) +{ + limb_t ret = (~l & (l - 1)) >> (LIMB_T_BITS - 1); + launder(ret); + return ret; +} + +static inline bool_t vec_is_zero(const void *a, size_t num) +{ + const limb_t *ap = (const limb_t *)a; + limb_t acc; + size_t i; + +#ifndef __BLST_NO_ASM__ + bool_t vec_is_zero_16x(const void *a, size_t num); + if ((num & 15) == 0) + return vec_is_zero_16x(a, num); +#endif + + num /= sizeof(limb_t); + + for (acc = 0, i = 0; i < num; i++) + acc |= ap[i]; + + return is_zero(acc); +} + +static inline bool_t vec_is_equal(const void *a, const void *b, size_t num) +{ + const limb_t *ap = (const limb_t *)a; + const limb_t *bp = (const limb_t *)b; + limb_t acc; + size_t i; + +#ifndef __BLST_NO_ASM__ + bool_t vec_is_equal_16x(const void *a, const void *b, size_t num); + if ((num & 15) == 0) + return vec_is_equal_16x(a, b, num); +#endif + + num /= sizeof(limb_t); + + for (acc = 0, i = 0; i < num; i++) + acc |= ap[i] ^ bp[i]; + + return is_zero(acc); +} + +static inline void cneg_mod_384x(vec384x ret, const vec384x a, bool_t flag, + const vec384 p) +{ + cneg_mod_384(ret[0], a[0], flag, p); + cneg_mod_384(ret[1], a[1], flag, p); +} + +static inline void vec_copy(void *restrict ret, const void *a, size_t num) +{ + limb_t *rp = (limb_t *)ret; + const limb_t *ap = (const limb_t *)a; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) + rp[i] = ap[i]; +} + +static inline void vec_zero(void *ret, size_t num) +{ + volatile limb_t *rp = (volatile limb_t *)ret; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) + rp[i] = 0; + +#if defined(__GNUC__) || defined(__clang__) + asm volatile("" : : "r"(ret) : "memory"); +#endif +} + +/* + * Some compilers get arguably overzealous(*) when passing pointer to + * multi-dimensional array [such as vec384x] as 'const' argument. + * General direction seems to be to legitimize such constification, + * so it's argued that suppressing the warning is appropriate. 
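vec_select(), vec_cswap(), is_zero() and byte_is_zero() above all build on the same branch-free idiom: stretch a 0/1 flag into an all-zeros or all-ones mask and combine words with XOR and AND, so that neither the executed instructions nor the memory access pattern depends on secret data (the launder() asm barrier is there to discourage compilers from optimizing the mask back into a branch). A minimal, hypothetical stand-alone illustration of the idiom, not blst code:

#include <stdint.h>
#include <stdio.h>

/* return sel_a ? a : b without branching on sel_a (sel_a must be 0 or 1) */
static uint64_t ct_select(uint64_t a, uint64_t b, uint64_t sel_a)
{
    uint64_t mask = (uint64_t)0 - sel_a;        /* 0x00...0 or 0xff...f */
    return b ^ ((a ^ b) & mask);
}

/* return 1 if l == 0 and 0 otherwise, without branching */
static uint64_t ct_is_zero(uint64_t l)
{   return (~l & (l - 1)) >> 63;   }

int main(void)
{
    printf("%llu %llu\n", (unsigned long long)ct_select(7, 9, 1),    /* 7 */
                          (unsigned long long)ct_select(7, 9, 0));   /* 9 */
    printf("%llu %llu\n", (unsigned long long)ct_is_zero(0),         /* 1 */
                          (unsigned long long)ct_is_zero(42));       /* 0 */
    return 0;
}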
+ * + * (*) http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1923.htm + */ +#if defined(__INTEL_COMPILER) +# pragma warning(disable:167) +# pragma warning(disable:556) +#elif defined(__GNUC__) && !defined(__clang__) +# pragma GCC diagnostic ignored "-Wpedantic" +#elif defined(_MSC_VER) +# pragma warning(disable: 4127 4189) +#endif + +#if !defined(__wasm__) +# include +#endif + +#if defined(__GNUC__) +# ifndef alloca +# define alloca(s) __builtin_alloca(s) +# endif +#elif defined(__sun) +# include +#elif defined(_WIN32) +# include +# ifndef alloca +# define alloca(s) _alloca(s) +# endif +#endif + +#endif /* __BLS12_381_ASM_VECT_H__ */ From 635f96099add5121be56768898ff8d5d8daf36ee Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 18 Jan 2023 17:19:52 -0600 Subject: [PATCH 002/200] add blst header files and README --- crypto/blst_src/README.md | 10 + crypto/blst_src/blst.h | 483 +++++++++++++++++++++++++++++++++++++ crypto/blst_src/blst_aux.h | 102 ++++++++ 3 files changed, 595 insertions(+) create mode 100644 crypto/blst_src/README.md create mode 100644 crypto/blst_src/blst.h create mode 100644 crypto/blst_src/blst_aux.h diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md new file mode 100644 index 00000000000..ff835dbc640 --- /dev/null +++ b/crypto/blst_src/README.md @@ -0,0 +1,10 @@ +WIP + +Files copied from BLST repo https://github.com/supranational/blst. +TODO: License and copyright mention + +content: +- all /src files (C source files) +- all /build (assembly generated files) +- /bindings/blst.h (headers of external functions) +- /bindings/blst_aux.h (headers of external aux functions) \ No newline at end of file diff --git a/crypto/blst_src/blst.h b/crypto/blst_src/blst.h new file mode 100644 index 00000000000..24213ded2c5 --- /dev/null +++ b/crypto/blst_src/blst.h @@ -0,0 +1,483 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLST_H__ +#define __BLST_H__ + +#ifdef __SIZE_TYPE__ +typedef __SIZE_TYPE__ size_t; +#else +#include +#endif + +#if defined(__UINT8_TYPE__) && defined(__UINT32_TYPE__) \ + && defined(__UINT64_TYPE__) +typedef __UINT8_TYPE__ uint8_t; +typedef __UINT32_TYPE__ uint32_t; +typedef __UINT64_TYPE__ uint64_t; +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#elif defined(__BLST_CGO__) +typedef _Bool bool; /* it's assumed that cgo calls modern enough compiler */ +#elif defined(__STDC_VERSION__) && __STDC_VERSION__>=199901 +# define bool _Bool +#else +# define bool int +#endif + +#ifdef SWIG +# define DEFNULL =NULL +#elif defined __cplusplus +# define DEFNULL =0 +#else +# define DEFNULL +#endif + +typedef enum { + BLST_SUCCESS = 0, + BLST_BAD_ENCODING, + BLST_POINT_NOT_ON_CURVE, + BLST_POINT_NOT_IN_GROUP, + BLST_AGGR_TYPE_MISMATCH, + BLST_VERIFY_FAIL, + BLST_PK_IS_INFINITY, + BLST_BAD_SCALAR, +} BLST_ERROR; + +typedef uint8_t byte; +typedef uint64_t limb_t; + +typedef struct { byte b[256/8]; } blst_scalar; +typedef struct { limb_t l[256/8/sizeof(limb_t)]; } blst_fr; +typedef struct { limb_t l[384/8/sizeof(limb_t)]; } blst_fp; +/* 0 is "real" part, 1 is "imaginary" */ +typedef struct { blst_fp fp[2]; } blst_fp2; +typedef struct { blst_fp2 fp2[3]; } blst_fp6; +typedef struct { blst_fp6 fp6[2]; } blst_fp12; + +void blst_scalar_from_uint32(blst_scalar *out, const uint32_t a[8]); +void blst_uint32_from_scalar(uint32_t out[8], const blst_scalar *a); +void blst_scalar_from_uint64(blst_scalar *out, const uint64_t a[4]); +void blst_uint64_from_scalar(uint64_t out[4], const blst_scalar *a); +void blst_scalar_from_bendian(blst_scalar *out, const byte a[32]); +void blst_bendian_from_scalar(byte out[32], const blst_scalar *a); +void blst_scalar_from_lendian(blst_scalar *out, const byte a[32]); +void blst_lendian_from_scalar(byte out[32], const blst_scalar *a); +bool blst_scalar_fr_check(const blst_scalar *a); +bool blst_sk_check(const blst_scalar *a); +bool blst_sk_add_n_check(blst_scalar *out, const blst_scalar *a, + const blst_scalar *b); +bool blst_sk_sub_n_check(blst_scalar *out, const blst_scalar *a, + const blst_scalar *b); +bool blst_sk_mul_n_check(blst_scalar *out, const blst_scalar *a, + const blst_scalar *b); +void blst_sk_inverse(blst_scalar *out, const blst_scalar *a); +bool blst_scalar_from_le_bytes(blst_scalar *out, const byte *in, size_t len); +bool blst_scalar_from_be_bytes(blst_scalar *out, const byte *in, size_t len); + +#ifndef SWIG +/* + * BLS12-381-specifc Fr operations. 
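As a small usage note for the scalar conversions declared above: a 32-byte big-endian encoding is typically imported with blst_scalar_from_bendian and then validated with blst_scalar_fr_check, which reports whether the value is a canonical scalar, i.e. already reduced modulo the group order. A hypothetical sketch against this header, with error handling left to the caller:

#include "blst.h"

/* Import a 32-byte big-endian scalar; true iff it is a canonical Fr element. */
static bool scalar_from_be32_checked(blst_scalar *out, const byte be32[32])
{
    blst_scalar_from_bendian(out, be32);
    return blst_scalar_fr_check(out);
}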
+ */ +void blst_fr_add(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_sub(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_mul_by_3(blst_fr *ret, const blst_fr *a); +void blst_fr_lshift(blst_fr *ret, const blst_fr *a, size_t count); +void blst_fr_rshift(blst_fr *ret, const blst_fr *a, size_t count); +void blst_fr_mul(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_sqr(blst_fr *ret, const blst_fr *a); +void blst_fr_cneg(blst_fr *ret, const blst_fr *a, bool flag); +void blst_fr_eucl_inverse(blst_fr *ret, const blst_fr *a); +void blst_fr_inverse(blst_fr *ret, const blst_fr *a); +#ifdef BLST_FR_PENTAROOT +void blst_fr_pentaroot(blst_fr *ret, const blst_fr *a); +void blst_fr_pentapow(blst_fr *ret, const blst_fr *a); +#endif + +void blst_fr_from_uint64(blst_fr *ret, const uint64_t a[4]); +void blst_uint64_from_fr(uint64_t ret[4], const blst_fr *a); +void blst_fr_from_scalar(blst_fr *ret, const blst_scalar *a); +void blst_scalar_from_fr(blst_scalar *ret, const blst_fr *a); + +/* + * BLS12-381-specifc Fp operations. + */ +void blst_fp_add(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_sub(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_mul_by_3(blst_fp *ret, const blst_fp *a); +void blst_fp_mul_by_8(blst_fp *ret, const blst_fp *a); +void blst_fp_lshift(blst_fp *ret, const blst_fp *a, size_t count); +void blst_fp_mul(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_sqr(blst_fp *ret, const blst_fp *a); +void blst_fp_cneg(blst_fp *ret, const blst_fp *a, bool flag); +void blst_fp_eucl_inverse(blst_fp *ret, const blst_fp *a); +void blst_fp_inverse(blst_fp *ret, const blst_fp *a); +bool blst_fp_sqrt(blst_fp *ret, const blst_fp *a); + +void blst_fp_from_uint32(blst_fp *ret, const uint32_t a[12]); +void blst_uint32_from_fp(uint32_t ret[12], const blst_fp *a); +void blst_fp_from_uint64(blst_fp *ret, const uint64_t a[6]); +void blst_uint64_from_fp(uint64_t ret[6], const blst_fp *a); +void blst_fp_from_bendian(blst_fp *ret, const byte a[48]); +void blst_bendian_from_fp(byte ret[48], const blst_fp *a); +void blst_fp_from_lendian(blst_fp *ret, const byte a[48]); +void blst_lendian_from_fp(byte ret[48], const blst_fp *a); + +/* + * BLS12-381-specifc Fp2 operations. + */ +void blst_fp2_add(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_sub(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_mul_by_3(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_mul_by_8(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_lshift(blst_fp2 *ret, const blst_fp2 *a, size_t count); +void blst_fp2_mul(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_sqr(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_cneg(blst_fp2 *ret, const blst_fp2 *a, bool flag); +void blst_fp2_eucl_inverse(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_inverse(blst_fp2 *ret, const blst_fp2 *a); +bool blst_fp2_sqrt(blst_fp2 *ret, const blst_fp2 *a); + +/* + * BLS12-381-specifc Fp12 operations. + */ +void blst_fp12_sqr(blst_fp12 *ret, const blst_fp12 *a); +void blst_fp12_cyclotomic_sqr(blst_fp12 *ret, const blst_fp12 *a); +void blst_fp12_mul(blst_fp12 *ret, const blst_fp12 *a, const blst_fp12 *b); +void blst_fp12_mul_by_xy00z0(blst_fp12 *ret, const blst_fp12 *a, + const blst_fp6 *xy00z0); +void blst_fp12_conjugate(blst_fp12 *a); +void blst_fp12_inverse(blst_fp12 *ret, const blst_fp12 *a); +/* caveat lector! |n| has to be non-zero and not more than 3! 
*/ +void blst_fp12_frobenius_map(blst_fp12 *ret, const blst_fp12 *a, size_t n); +bool blst_fp12_is_equal(const blst_fp12 *a, const blst_fp12 *b); +bool blst_fp12_is_one(const blst_fp12 *a); +bool blst_fp12_in_group(const blst_fp12 *a); +const blst_fp12 *blst_fp12_one(); +#endif // SWIG + +/* + * BLS12-381-specifc point operations. + */ +typedef struct { blst_fp x, y, z; } blst_p1; +typedef struct { blst_fp x, y; } blst_p1_affine; + +void blst_p1_add(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); +void blst_p1_add_or_double(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); +void blst_p1_add_affine(blst_p1 *out, const blst_p1 *a, + const blst_p1_affine *b); +void blst_p1_add_or_double_affine(blst_p1 *out, const blst_p1 *a, + const blst_p1_affine *b); +void blst_p1_double(blst_p1 *out, const blst_p1 *a); +void blst_p1_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar, + size_t nbits); +void blst_p1_cneg(blst_p1 *p, bool cbit); +void blst_p1_to_affine(blst_p1_affine *out, const blst_p1 *in); +void blst_p1_from_affine(blst_p1 *out, const blst_p1_affine *in); +bool blst_p1_on_curve(const blst_p1 *p); +bool blst_p1_in_g1(const blst_p1 *p); +bool blst_p1_is_equal(const blst_p1 *a, const blst_p1 *b); +bool blst_p1_is_inf(const blst_p1 *a); +const blst_p1 *blst_p1_generator(); + +bool blst_p1_affine_on_curve(const blst_p1_affine *p); +bool blst_p1_affine_in_g1(const blst_p1_affine *p); +bool blst_p1_affine_is_equal(const blst_p1_affine *a, const blst_p1_affine *b); +bool blst_p1_affine_is_inf(const blst_p1_affine *a); +const blst_p1_affine *blst_p1_affine_generator(); + +typedef struct { blst_fp2 x, y, z; } blst_p2; +typedef struct { blst_fp2 x, y; } blst_p2_affine; + +void blst_p2_add(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); +void blst_p2_add_or_double(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); +void blst_p2_add_affine(blst_p2 *out, const blst_p2 *a, + const blst_p2_affine *b); +void blst_p2_add_or_double_affine(blst_p2 *out, const blst_p2 *a, + const blst_p2_affine *b); +void blst_p2_double(blst_p2 *out, const blst_p2 *a); +void blst_p2_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar, + size_t nbits); +void blst_p2_cneg(blst_p2 *p, bool cbit); +void blst_p2_to_affine(blst_p2_affine *out, const blst_p2 *in); +void blst_p2_from_affine(blst_p2 *out, const blst_p2_affine *in); +bool blst_p2_on_curve(const blst_p2 *p); +bool blst_p2_in_g2(const blst_p2 *p); +bool blst_p2_is_equal(const blst_p2 *a, const blst_p2 *b); +bool blst_p2_is_inf(const blst_p2 *a); +const blst_p2 *blst_p2_generator(); + +bool blst_p2_affine_on_curve(const blst_p2_affine *p); +bool blst_p2_affine_in_g2(const blst_p2_affine *p); +bool blst_p2_affine_is_equal(const blst_p2_affine *a, const blst_p2_affine *b); +bool blst_p2_affine_is_inf(const blst_p2_affine *a); +const blst_p2_affine *blst_p2_affine_generator(); + +/* + * Multi-scalar multiplications and other multi-point operations. 
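To make the point interface above concrete: blst_p1_mult takes the scalar as a little-endian byte string together with an explicit bit count, so multiplying the G1 generator by a blst_scalar (whose b[] member holds the scalar bytes in little-endian order) is usually written along the following lines. This is a hypothetical sketch rather than an exported blst helper; 255 bits is enough to cover the BLS12-381 group order:

#include "blst.h"

/* out = SK * G1 generator, returned in affine coordinates */
static void g1_scalar_base_mult(blst_p1_affine *out, const blst_scalar *SK)
{
    blst_p1 tmp;

    blst_p1_mult(&tmp, blst_p1_generator(), SK->b, 255);
    blst_p1_to_affine(out, &tmp);
}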
+ */ + +void blst_p1s_to_affine(blst_p1_affine dst[], const blst_p1 *const points[], + size_t npoints); +void blst_p1s_add(blst_p1 *ret, const blst_p1_affine *const points[], + size_t npoints); + +size_t blst_p1s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints); +void blst_p1s_mult_wbits_precompute(blst_p1_affine table[], size_t wbits, + const blst_p1_affine *const points[], + size_t npoints); +size_t blst_p1s_mult_wbits_scratch_sizeof(size_t npoints); +void blst_p1s_mult_wbits(blst_p1 *ret, const blst_p1_affine table[], + size_t wbits, size_t npoints, + const byte *const scalars[], size_t nbits, + limb_t *scratch); + +size_t blst_p1s_mult_pippenger_scratch_sizeof(size_t npoints); +void blst_p1s_mult_pippenger(blst_p1 *ret, const blst_p1_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch); +void blst_p1s_tile_pippenger(blst_p1 *ret, const blst_p1_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch, + size_t bit0, size_t window); + +void blst_p2s_to_affine(blst_p2_affine dst[], const blst_p2 *const points[], + size_t npoints); +void blst_p2s_add(blst_p2 *ret, const blst_p2_affine *const points[], + size_t npoints); + +size_t blst_p2s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints); +void blst_p2s_mult_wbits_precompute(blst_p2_affine table[], size_t wbits, + const blst_p2_affine *const points[], + size_t npoints); +size_t blst_p2s_mult_wbits_scratch_sizeof(size_t npoints); +void blst_p2s_mult_wbits(blst_p2 *ret, const blst_p2_affine table[], + size_t wbits, size_t npoints, + const byte *const scalars[], size_t nbits, + limb_t *scratch); + +size_t blst_p2s_mult_pippenger_scratch_sizeof(size_t npoints); +void blst_p2s_mult_pippenger(blst_p2 *ret, const blst_p2_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch); +void blst_p2s_tile_pippenger(blst_p2 *ret, const blst_p2_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch, + size_t bit0, size_t window); + +/* + * Hash-to-curve operations. + */ +#ifndef SWIG +void blst_map_to_g1(blst_p1 *out, const blst_fp *u, const blst_fp *v DEFNULL); +void blst_map_to_g2(blst_p2 *out, const blst_fp2 *u, const blst_fp2 *v DEFNULL); +#endif + +void blst_encode_to_g1(blst_p1 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); +void blst_hash_to_g1(blst_p1 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); + +void blst_encode_to_g2(blst_p2 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); +void blst_hash_to_g2(blst_p2 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); + +/* + * Zcash-compatible serialization/deserialization. 
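A usage note on the Pippenger entry points above: the intended pattern is to query the scratch size for the batch, allocate it, and hand over parallel arrays of point and scalar pointers. The sketch below is hypothetical, assumes the *_scratch_sizeof helpers return a size in bytes, and omits allocation failure handling:

#include <stdlib.h>
#include "blst.h"

/* ret = sum_i scalars[i] * points[i] over G1 */
static void g1_msm(blst_p1 *ret, const blst_p1_affine *const points[],
                   const byte *const scalars[], size_t npoints, size_t nbits)
{
    limb_t *scratch = malloc(blst_p1s_mult_pippenger_scratch_sizeof(npoints));

    blst_p1s_mult_pippenger(ret, points, npoints, scalars, nbits, scratch);
    free(scratch);
}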
+ */ +void blst_p1_serialize(byte out[96], const blst_p1 *in); +void blst_p1_compress(byte out[48], const blst_p1 *in); +void blst_p1_affine_serialize(byte out[96], const blst_p1_affine *in); +void blst_p1_affine_compress(byte out[48], const blst_p1_affine *in); +BLST_ERROR blst_p1_uncompress(blst_p1_affine *out, const byte in[48]); +BLST_ERROR blst_p1_deserialize(blst_p1_affine *out, const byte in[96]); + +void blst_p2_serialize(byte out[192], const blst_p2 *in); +void blst_p2_compress(byte out[96], const blst_p2 *in); +void blst_p2_affine_serialize(byte out[192], const blst_p2_affine *in); +void blst_p2_affine_compress(byte out[96], const blst_p2_affine *in); +BLST_ERROR blst_p2_uncompress(blst_p2_affine *out, const byte in[96]); +BLST_ERROR blst_p2_deserialize(blst_p2_affine *out, const byte in[192]); + +/* + * Specification defines two variants, 'minimal-signature-size' and + * 'minimal-pubkey-size'. To unify appearance we choose to distinguish + * them by suffix referring to the public key type, more specifically + * _pk_in_g1 corresponds to 'minimal-pubkey-size' and _pk_in_g2 - to + * 'minimal-signature-size'. It might appear a bit counterintuitive + * in sign call, but no matter how you twist it, something is bound to + * turn a little odd. + */ +/* + * Secret-key operations. + */ +void blst_keygen(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_sk_to_pk_in_g1(blst_p1 *out_pk, const blst_scalar *SK); +void blst_sign_pk_in_g1(blst_p2 *out_sig, const blst_p2 *hash, + const blst_scalar *SK); +void blst_sk_to_pk_in_g2(blst_p2 *out_pk, const blst_scalar *SK); +void blst_sign_pk_in_g2(blst_p1 *out_sig, const blst_p1 *hash, + const blst_scalar *SK); + +/* + * Pairing interface. + */ +#ifndef SWIG +void blst_miller_loop(blst_fp12 *ret, const blst_p2_affine *Q, + const blst_p1_affine *P); +void blst_final_exp(blst_fp12 *ret, const blst_fp12 *f); +void blst_precompute_lines(blst_fp6 Qlines[68], const blst_p2_affine *Q); +void blst_miller_loop_lines(blst_fp12 *ret, const blst_fp6 Qlines[68], + const blst_p1_affine *P); +bool blst_fp12_finalverify(const blst_fp12 *gt1, const blst_fp12 *gt2); +#endif + +#ifdef __BLST_CGO__ +typedef limb_t blst_pairing; +#elif defined(__BLST_RUST_BINDGEN__) +typedef struct {} blst_pairing; +#else +typedef struct blst_opaque blst_pairing; +#endif + +size_t blst_pairing_sizeof(); +void blst_pairing_init(blst_pairing *new_ctx, bool hash_or_encode, + const byte *DST DEFNULL, size_t DST_len DEFNULL); +const byte *blst_pairing_get_dst(const blst_pairing *ctx); +void blst_pairing_commit(blst_pairing *ctx); +BLST_ERROR blst_pairing_aggregate_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + const blst_p1_affine *signature, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + bool pk_grpchk, + const blst_p1_affine *signature, + bool sig_grpchk, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + const blst_p1_affine *sig, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + bool pk_grpchk, + const blst_p1_affine *sig, + bool sig_grpchk, + const byte *scalar, + 
size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_aggregate_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + const blst_p2_affine *signature, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + bool pk_grpchk, + const blst_p2_affine *signature, + bool sig_grpchk, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + const blst_p2_affine *sig, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + bool pk_grpchk, + const blst_p2_affine *sig, + bool sig_grpchk, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_merge(blst_pairing *ctx, const blst_pairing *ctx1); +bool blst_pairing_finalverify(const blst_pairing *ctx, + const blst_fp12 *gtsig DEFNULL); + + +/* + * Customarily applications aggregate signatures separately. + * In which case application would have to pass NULLs for |signature| + * to blst_pairing_aggregate calls and pass aggregated signature + * collected with these calls to blst_pairing_finalverify. Inputs are + * Zcash-compatible "straight-from-wire" byte vectors, compressed or + * not. + */ +BLST_ERROR blst_aggregate_in_g1(blst_p1 *out, const blst_p1 *in, + const byte *zwire); +BLST_ERROR blst_aggregate_in_g2(blst_p2 *out, const blst_p2 *in, + const byte *zwire); + +void blst_aggregated_in_g1(blst_fp12 *out, const blst_p1_affine *signature); +void blst_aggregated_in_g2(blst_fp12 *out, const blst_p2_affine *signature); + +/* + * "One-shot" CoreVerify entry points. + */ +BLST_ERROR blst_core_verify_pk_in_g1(const blst_p1_affine *pk, + const blst_p2_affine *signature, + bool hash_or_encode, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, + size_t DST_len DEFNULL, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_core_verify_pk_in_g2(const blst_p2_affine *pk, + const blst_p1_affine *signature, + bool hash_or_encode, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, + size_t DST_len DEFNULL, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); + +extern const blst_p1_affine BLS12_381_G1; +extern const blst_p1_affine BLS12_381_NEG_G1; +extern const blst_p2_affine BLS12_381_G2; +extern const blst_p2_affine BLS12_381_NEG_G2; + +#include "blst_aux.h" + +#ifdef __cplusplus +} +#endif +#endif diff --git a/crypto/blst_src/blst_aux.h b/crypto/blst_src/blst_aux.h new file mode 100644 index 00000000000..6d444fc1729 --- /dev/null +++ b/crypto/blst_src/blst_aux.h @@ -0,0 +1,102 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLST_AUX_H__ +#define __BLST_AUX_H__ +/* + * This file lists interfaces that might be promoted to blst.h or removed, + * depending on their proven/unproven worthiness. 
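Tying the declarations above together, the "minimal-pubkey-size" flow (public keys in G1, signatures in G2) goes roughly: derive a secret scalar from input key material, publish the 48-byte compressed G1 public key, hash the message to G2, sign, and check with the one-shot CoreVerify entry point. The sketch below is hypothetical glue code against this header, with the usual hash-to-G2 ciphersuite DST used only as a placeholder and error handling omitted; IKM is expected to be at least 32 bytes:

#include "blst.h"

static const byte DST[] = "BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_";

static BLST_ERROR sign_and_verify(const byte *ikm, size_t ikm_len,
                                  const byte *msg, size_t msg_len)
{
    blst_scalar sk;
    blst_p1 pk;         blst_p1_affine pk_aff;
    blst_p2 hash, sig;  blst_p2_affine sig_aff;
    byte pk_comp[48];

    blst_keygen(&sk, ikm, ikm_len, NULL, 0);       /* IKM -> secret scalar     */
    blst_sk_to_pk_in_g1(&pk, &sk);                 /* pk = sk * G1 generator   */
    blst_p1_to_affine(&pk_aff, &pk);
    blst_p1_affine_compress(pk_comp, &pk_aff);     /* 48-byte public key       */

    blst_hash_to_g2(&hash, msg, msg_len, DST, sizeof(DST) - 1, NULL, 0);
    blst_sign_pk_in_g1(&sig, &hash, &sk);          /* sig = sk * H(msg)        */
    blst_p2_to_affine(&sig_aff, &sig);

    return blst_core_verify_pk_in_g1(&pk_aff, &sig_aff, 1 /* hash, not encode */,
                                     msg, msg_len, DST, sizeof(DST) - 1,
                                     NULL, 0);
}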
+ */ + +void blst_fr_to(blst_fr *ret, const blst_fr *a); +void blst_fr_from(blst_fr *ret, const blst_fr *a); + +void blst_fp_to(blst_fp *ret, const blst_fp *a); +void blst_fp_from(blst_fp *ret, const blst_fp *a); + +bool blst_fp_is_square(const blst_fp *a); +bool blst_fp2_is_square(const blst_fp2 *a); + +void blst_p1_from_jacobian(blst_p1 *out, const blst_p1 *in); +void blst_p2_from_jacobian(blst_p2 *out, const blst_p2 *in); + +/* + * Below functions produce both point and deserialized outcome of + * SkToPk and Sign. However, deserialized outputs are pre-decorated + * with sign and infinity bits. This means that you have to bring the + * output into compliance prior returning to application. If you want + * compressed point value, then do [equivalent of] + * + * byte temp[96]; + * blst_sk_to_pk2_in_g1(temp, out_pk, SK); + * temp[0] |= 0x80; + * memcpy(out, temp, 48); + * + * Otherwise do + * + * blst_sk_to_pk2_in_g1(out, out_pk, SK); + * out[0] &= ~0x20; + * + * Either |out| or |out_| can be NULL. + */ +void blst_sk_to_pk2_in_g1(byte out[96], blst_p1_affine *out_pk, + const blst_scalar *SK); +void blst_sign_pk2_in_g1(byte out[192], blst_p2_affine *out_sig, + const blst_p2 *hash, const blst_scalar *SK); +void blst_sk_to_pk2_in_g2(byte out[192], blst_p2_affine *out_pk, + const blst_scalar *SK); +void blst_sign_pk2_in_g2(byte out[96], blst_p1_affine *out_sig, + const blst_p1 *hash, const blst_scalar *SK); + +typedef struct {} blst_uniq; + +size_t blst_uniq_sizeof(size_t n_nodes); +void blst_uniq_init(blst_uniq *tree); +bool blst_uniq_test(blst_uniq *tree, const byte *msg, size_t len); + +#ifdef expand_message_xmd +void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len); +#else +void blst_expand_message_xmd(byte *out, size_t out_len, + const byte *msg, size_t msg_len, + const byte *DST, size_t DST_len); +#endif + +void blst_p1_unchecked_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar, + size_t nbits); +void blst_p2_unchecked_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar, + size_t nbits); + +void blst_pairing_raw_aggregate(blst_pairing *ctx, const blst_p2_affine *q, + const blst_p1_affine *p); +blst_fp12 *blst_pairing_as_fp12(blst_pairing *ctx); +void blst_bendian_from_fp12(byte out[48*12], const blst_fp12 *a); + +void blst_keygen_v3(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_keygen_v4_5(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *salt, size_t salt_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_keygen_v5(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *salt, size_t salt_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_derive_master_eip2333(blst_scalar *out_SK, + const byte *IKM, size_t IKM_len); +void blst_derive_child_eip2333(blst_scalar *out_SK, const blst_scalar *SK, + uint32_t child_index); + +void blst_scalar_from_hexascii(blst_scalar *out, const byte *hex); +void blst_fr_from_hexascii(blst_fr *ret, const byte *hex); +void blst_fp_from_hexascii(blst_fp *ret, const byte *hex); + +size_t blst_p1_sizeof(); +size_t blst_p1_affine_sizeof(); +size_t blst_p2_sizeof(); +size_t blst_p2_affine_sizeof(); +size_t blst_fp12_sizeof(); +#endif From abe0fe10ca53e3880816e77b4895296a38b64813 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 18 Jan 2023 17:20:57 -0600 Subject: 
[PATCH 003/200] cgo directives to compile blst files --- crypto/bls.go | 3 +++ crypto/bls12381_utils.go | 3 +-- crypto/bls12381_utils.h | 3 +-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 1d725ebab63..9e64d283c2d 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -33,7 +33,10 @@ package crypto // #cgo CFLAGS: -g -Wall -std=c99 // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s +// #cgo amd64 CFLAGS: -D__ADX__ -mno-avx +// #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ // #include "bls_include.h" +// #include "blst.h" import "C" import ( diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 4138d35a599..f9a94beb1ee 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -7,10 +7,9 @@ package crypto // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -// #cgo CFLAGS: -g -Wall -std=c99 -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low +// #cgo CFLAGS: -g -Wall -std=c99 -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #include "bls12381_utils.h" -// #include "bls_include.h" import "C" import ( "errors" diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index c7e3587f664..de2efe9cb53 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -8,8 +8,7 @@ #define _REL_MISC_INCLUDE_H #include "relic.h" - -typedef uint8_t byte; +#include "blst.h" #define VALID RLC_OK #define INVALID RLC_ERR From 59be5248277a2014b4ea7a4c1fa86f5eed5299ad Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 23 Jan 2023 20:11:41 -0600 Subject: [PATCH 004/200] update README and clean up C directives --- crypto/bls.go | 3 --- crypto/bls12381_utils.go | 3 +++ crypto/blst_src/README.md | 14 ++++++++++---- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 9e64d283c2d..1d725ebab63 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -33,10 +33,7 @@ package crypto // #cgo CFLAGS: -g -Wall -std=c99 // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s -// #cgo amd64 CFLAGS: -D__ADX__ -mno-avx -// #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ // #include "bls_include.h" -// #include "blst.h" import "C" import ( diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index f9a94beb1ee..7e327571e47 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -9,7 +9,10 @@ package crypto // #cgo CFLAGS: -g -Wall -std=c99 -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s +// #cgo amd64 CFLAGS: -D__ADX__ -mno-avx +// #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ // #include "bls12381_utils.h" +// #include "blst.h" import "C" import ( "errors" diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md index ff835dbc640..c5867bcd742 100644 --- a/crypto/blst_src/README.md +++ b/crypto/blst_src/README.md @@ -1,9 +1,15 @@ -WIP +All files in this folder contain source files copied from the BLST repo 
https://github.com/supranational/blst +specifically from the commit <92c12ac58095de04e776cec5ef5ce5bdf242b693>. -Files copied from BLST repo https://github.com/supranational/blst. -TODO: License and copyright mention + Copyright Supranational LLC + Licensed under the Apache License, Version 2.0, see LICENSE for details. + SPDX-License-Identifier: Apache-2.0 -content: +While BLST exports multiple functions and tools, the implementation in Flow crypto requires access to low level functions. Some of these tools are not exported by BLST, others would need to be used without paying for the cgo cost, and therefore without using the Go bindings in BLST. + + +The folder contains: +- BLST LICENSE file - all /src files (C source files) - all /build (assembly generated files) - /bindings/blst.h (headers of external functions) From 56b4b5608f17a02c6b68c098f1fa05d4572e1649 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 25 Jan 2023 12:48:49 -0800 Subject: [PATCH 005/200] remove non code files --- crypto/blst_src/README.md | 2 +- crypto/blst_src/blst_t.hpp | 538 ------------------------------- crypto/blst_src/build/refresh.sh | 49 --- 3 files changed, 1 insertion(+), 588 deletions(-) delete mode 100644 crypto/blst_src/blst_t.hpp delete mode 100755 crypto/blst_src/build/refresh.sh diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md index c5867bcd742..12bc7b863ca 100644 --- a/crypto/blst_src/README.md +++ b/crypto/blst_src/README.md @@ -10,7 +10,7 @@ While BLST exports multiple functions and tools, the implementation in Flow cryp The folder contains: - BLST LICENSE file -- all /src files (C source files) +- all /src/*.c and /src/*.h files (C source files) - all /build (assembly generated files) - /bindings/blst.h (headers of external functions) - /bindings/blst_aux.h (headers of external aux functions) \ No newline at end of file diff --git a/crypto/blst_src/blst_t.hpp b/crypto/blst_src/blst_t.hpp deleted file mode 100644 index 1b150da30ce..00000000000 --- a/crypto/blst_src/blst_t.hpp +++ /dev/null @@ -1,538 +0,0 @@ -// Copyright Supranational LLC -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef __BLST_T_HPP__ -#define __BLST_T_HPP__ - -/* - * These templates, blst_384_t and blst_256_t, allow to instantiate slim - * C++ shims to blst assembly with arbitrary moduli. Well, not literally - * arbitrary, as there are limitations. Most notably blst_384_t can not - * actually accommodate 384-bit moduli, only 383 and narrower. This is - * because of ct_inverse_mod_383's limitation. Though if you abstain - * from the reciprocal() method, even 384-bit modulus would work. As for - * blst_256_t, modulus has to be not larger than 2^256-2^192-1. 
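(Editorial aside, not part of the patch: the comment above explains what these template shims are for, but the template parameter list itself is truncated in this excerpt. The sketch below is a minimal, hypothetical instantiation of the 256-bit shim; it assumes the template is parameterized by the modulus, the Montgomery constant M0, RR and ONE, in that order, and that constants named r_mod, r_M0, r_RR and r_ONE are defined elsewhere with external linkage. None of these names or values come from the patch.)

    // Hypothetical usage sketch: instantiate the 256-bit shim for some odd
    // modulus m <= 2^256 - 2^192 - 1 and use the overloaded operators.
    #include "blst_t.hpp"                      // pulls in vect.h (limb_t, vec256)

    extern const vec256 r_mod, r_RR, r_ONE;    // assumed: m, R^2 mod m, R mod m
    static const limb_t r_M0 = 0;              // placeholder for -(m^-1) mod 2^64

    typedef blst_256_t<r_mod, r_M0, r_RR, r_ONE> fr_t;

    static fr_t demo(const fr_t& a, const fr_t& b)
    {
        fr_t c = a * b + a;                    // mul_mont_sparse_256 / add_mod_256
        return 1 / c;                          // reciprocal() via ct_inverse_mod_256
    }
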
- */ - -#ifdef __GNUC__ -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wunused-function" -#endif - -extern "C" { -#include "vect.h" -} -#include "bytes.h" - -#undef launder // avoid conflict with C++ >=17 - -#ifdef __GNUC__ -# pragma GCC diagnostic pop -#endif - -static inline void vec_left_align(limb_t *out, const limb_t *inp, size_t n) -{ - const unsigned int nbits = sizeof(inp[0])*8; - unsigned int align = 0; - limb_t top = inp[n-1]; - - if (top) { - while ((top >> (nbits-1)) == 0) - top <<= 1, align++; - } - if (align) { - while (--n) { - limb_t next = inp[n-1]; - out[n] = top | next >> (nbits-align); - top = next << align; - } - out[0] = top; - } else { - for (size_t i = 0; i < n-1; i++) - out[i] = inp[i]; - out[n-1] = top; - } -} - -constexpr static inline size_t vec_nbits(const limb_t *inp, size_t n) -{ - const unsigned int nbits = sizeof(inp[0])*8; - size_t align = 0; - limb_t top = inp[n-1]; - - while ((top >> (nbits-1)) == 0) - top <<= 1, align++; - - return n*nbits - align; -} - -template -class blst_384_t { -private: - vec384 val; - - inline operator const limb_t*() const { return val; } - inline operator limb_t*() { return val; } - inline limb_t& operator[](size_t i) { return val[i]; } - inline const limb_t& operator[](size_t i) const { return val[i]; } - -public: - static const size_t n = sizeof(vec384)/sizeof(limb_t); - static const size_t nbits = vec_nbits(MOD, n); - typedef byte pow_t[384/8]; - - inline blst_384_t() {} - inline blst_384_t(const vec384 p, bool align = false) - { - if (align) - vec_left_align(val, p, n); - else - vec_copy(val, p, sizeof(val)); - } - inline blst_384_t(uint64_t a) - { - vec_zero(val, sizeof(val)); - val[0] = a; - if (a) to(); - } - inline blst_384_t(int a) : blst_384_t((uint64_t)a) {} - - inline void to_scalar(pow_t& scalar) const - { - const union { - long one; - char little; - } is_endian = { 1 }; - - if ((size_t)scalar%sizeof(limb_t) == 0 && is_endian.little) { - from_mont_384((limb_t *)scalar, val, MOD, M0); - } else { - vec384 out; - from_mont_384(out, val, MOD, M0); - le_bytes_from_limbs(scalar, out, sizeof(pow_t)); - vec_zero(out, sizeof(out)); - } - } - - static inline const blst_384_t& one() - { return *reinterpret_cast(ONE); } - - inline blst_384_t& to() - { mul_mont_384(val, RR, val, MOD, M0); return *this; } - inline blst_384_t& from() - { from_mont_384(val, val, MOD, M0); return *this; } - - inline void store(limb_t *p) const - { vec_copy(p, val, sizeof(val)); } - - inline blst_384_t& operator+=(const blst_384_t& b) - { add_mod_384(val, val, b, MOD); return *this; } - friend inline blst_384_t operator+(const blst_384_t& a, const blst_384_t& b) - { - blst_384_t ret; - add_mod_384(ret, a, b, MOD); - return ret; - } - - inline blst_384_t& operator<<=(unsigned l) - { lshift_mod_384(val, val, l, MOD); return *this; } - friend inline blst_384_t operator<<(const blst_384_t& a, unsigned l) - { - blst_384_t ret; - lshift_mod_384(ret, a, l, MOD); - return ret; - } - - inline blst_384_t& operator>>=(unsigned r) - { rshift_mod_384(val, val, r, MOD); return *this; } - friend inline blst_384_t operator>>(blst_384_t a, unsigned r) - { - blst_384_t ret; - rshift_mod_384(ret, a, r, MOD); - return ret; - } - - inline blst_384_t& operator-=(const blst_384_t& b) - { sub_mod_384(val, val, b, MOD); return *this; } - friend inline blst_384_t operator-(const blst_384_t& a, const blst_384_t& b) - { - blst_384_t ret; - sub_mod_384(ret, a, b, MOD); - return ret; - } - - inline blst_384_t& cneg(bool flag) - { cneg_mod_384(val, val, flag, 
MOD); return *this; } - friend inline blst_384_t cneg(const blst_384_t& a, bool flag) - { - blst_384_t ret; - cneg_mod_384(ret, a, flag, MOD); - return ret; - } - friend inline blst_384_t operator-(const blst_384_t& a) - { - blst_384_t ret; - cneg_mod_384(ret, a, true, MOD); - return ret; - } - - inline blst_384_t& operator*=(const blst_384_t& a) - { - if (this == &a) sqr_mont_384(val, val, MOD, M0); - else mul_mont_384(val, val, a, MOD, M0); - return *this; - } - friend inline blst_384_t operator*(const blst_384_t& a, const blst_384_t& b) - { - blst_384_t ret; - if (&a == &b) sqr_mont_384(ret, a, MOD, M0); - else mul_mont_384(ret, a, b, MOD, M0); - return ret; - } - - // simplified exponentiation, but mind the ^ operator's precedence! - friend inline blst_384_t operator^(const blst_384_t& a, unsigned p) - { - if (p < 2) { - abort(); - } else if (p == 2) { - blst_384_t ret; - sqr_mont_384(ret, a, MOD, M0); - return ret; - } else { - blst_384_t ret; - sqr_mont_384(ret, a, MOD, M0); - for (p -= 2; p--;) - mul_mont_384(ret, ret, a, MOD, M0); - return ret; - } - } - inline blst_384_t& operator^=(unsigned p) - { - if (p < 2) { - abort(); - } else if (p == 2) { - sqr_mont_384(val, val, MOD, M0); - return *this; - } - return *this = *this^p; - } - inline blst_384_t operator()(unsigned p) - { return *this^p; } - friend inline blst_384_t sqr(const blst_384_t& a) - { return a^2; } - - inline bool is_zero() const - { return vec_is_zero(val, sizeof(val)); } - - inline void zero() - { vec_zero(val, sizeof(val)); } - - blst_384_t reciprocal() const - { - static const blst_384_t MODx{MOD, true}; - static const blst_384_t RRx4 = *reinterpret_cast(RR)<<2; - union { vec768 x; vec384 r[2]; } temp; - - ct_inverse_mod_383(temp.x, val, MOD, MODx); - redc_mont_384(temp.r[0], temp.x, MOD, M0); - mul_mont_384(temp.r[0], temp.r[0], RRx4, MOD, M0); - - return *reinterpret_cast(temp.r[0]); - } - friend inline blst_384_t operator/(unsigned one, const blst_384_t& a) - { - if (one == 1) - return a.reciprocal(); - abort(); - } - friend inline blst_384_t operator/(const blst_384_t& a, const blst_384_t& b) - { return a * b.reciprocal(); } - inline blst_384_t& operator/=(const blst_384_t& a) - { return *this *= a.reciprocal(); } - -#ifndef NDEBUG - inline blst_384_t(const char *hexascii) - { limbs_from_hexascii(val, sizeof(val), hexascii); to(); } - - friend inline bool operator==(const blst_384_t& a, const blst_384_t& b) - { return vec_is_equal(a, b, sizeof(vec384)); } - friend inline bool operator!=(const blst_384_t& a, const blst_384_t& b) - { return !vec_is_equal(a, b, sizeof(vec384)); } - -# if defined(_GLIBCXX_IOSTREAM) || defined(_IOSTREAM_) // non-standard - friend std::ostream& operator<<(std::ostream& os, const blst_384_t& obj) - { - unsigned char be[sizeof(obj)]; - char buf[2+2*sizeof(obj)+1], *str = buf; - - be_bytes_from_limbs(be, blst_384_t{obj}.from(), sizeof(obj)); - - *str++ = '0', *str++ = 'x'; - for (size_t i = 0; i < sizeof(obj); i++) - *str++ = hex_from_nibble(be[i]>>4), *str++ = hex_from_nibble(be[i]); - *str = '\0'; - - return os << buf; - } -# endif -#endif -}; - -template -class blst_256_t { - vec256 val; - - inline operator const limb_t*() const { return val; } - inline operator limb_t*() { return val; } - inline limb_t& operator[](size_t i) { return val[i]; } - inline const limb_t& operator[](size_t i) const { return val[i]; } - -public: - static const size_t n = sizeof(vec256)/sizeof(limb_t); - static const size_t nbits = vec_nbits(MOD, n); - typedef byte pow_t[256/8]; - - inline blst_256_t() {} 
- inline blst_256_t(const vec256 p, bool align = false) - { - if (align) - vec_left_align(val, p, n); - else - vec_copy(val, p, sizeof(val)); - } - inline blst_256_t(uint64_t a) - { - vec_zero(val, sizeof(val)); - val[0] = a; - if (a) to(); - } - inline blst_256_t(int a) : blst_256_t((uint64_t)a) {} - - inline void to_scalar(pow_t& scalar) const - { - const union { - long one; - char little; - } is_endian = { 1 }; - - if ((size_t)scalar%sizeof(limb_t) == 0 && is_endian.little) { - from_mont_256((limb_t *)scalar, val, MOD, M0); - } else { - vec256 out; - from_mont_256(out, val, MOD, M0); - le_bytes_from_limbs(scalar, out, sizeof(pow_t)); - vec_zero(out, sizeof(out)); - } - } - - static inline const blst_256_t& one() - { return *reinterpret_cast(ONE); } - - inline blst_256_t& to() - { mul_mont_sparse_256(val, val, RR, MOD, M0); return *this; } - inline blst_256_t& to(const uint64_t a[2*n]) - { - mul_mont_sparse_256(val, RR, (const limb_t*)(a + n), MOD, M0); - vec256 lo{0}; - add_mod_256(lo, lo, (const limb_t*)a, MOD); - add_mod_256(val, val, lo, MOD); - mul_mont_sparse_256(val, RR, val, MOD, M0); - - return *this; - } - blst_256_t& to(const unsigned char* bytes, size_t n, bool le = false) - { - vec_zero(val, sizeof(val)); - - vec256 digit, zero{0}; - size_t rem = (n - 1) % 32 + 1; - n -= rem; - - if (le) { - limbs_from_le_bytes(val, bytes += n, rem); - mul_mont_sparse_256(val, RR, val, MOD, M0); - while (n) { - limbs_from_le_bytes(digit, bytes -= 32, 32); - add_mod_256(digit, digit, zero, MOD); - add_mod_256(val, val, digit, MOD); - mul_mont_sparse_256(val, RR, val, MOD, M0); - n -= 32; - } - } else { - limbs_from_be_bytes(val, bytes, rem); - mul_mont_sparse_256(val, RR, val, MOD, M0); - bytes += rem; - while (n) { - limbs_from_be_bytes(digit, bytes, 32); - add_mod_256(digit, digit, zero, MOD); - add_mod_256(val, val, digit, MOD); - mul_mont_sparse_256(val, RR, val, MOD, M0); - bytes += 32; - n -= 32; - } - } - - return *this; - } - - inline blst_256_t& from() - { from_mont_256(val, val, MOD, M0); return *this; } - - inline void store(limb_t *p) const - { vec_copy(p, val, sizeof(val)); } - - inline blst_256_t& operator+=(const blst_256_t& b) - { add_mod_256(val, val, b, MOD); return *this; } - friend inline blst_256_t operator+(const blst_256_t& a, const blst_256_t& b) - { - blst_256_t ret; - add_mod_256(ret, a, b, MOD); - return ret; - } - - inline blst_256_t& operator<<=(unsigned l) - { lshift_mod_256(val, val, l, MOD); return *this; } - friend inline blst_256_t operator<<(const blst_256_t& a, unsigned l) - { - blst_256_t ret; - lshift_mod_256(ret, a, l, MOD); - return ret; - } - - inline blst_256_t& operator>>=(unsigned r) - { lshift_mod_256(val, val, r, MOD); return *this; } - friend inline blst_256_t operator>>(blst_256_t a, unsigned r) - { - blst_256_t ret; - lshift_mod_256(ret, a, r, MOD); - return ret; - } - - inline blst_256_t& operator-=(const blst_256_t& b) - { sub_mod_256(val, val, b, MOD); return *this; } - friend inline blst_256_t operator-(const blst_256_t& a, const blst_256_t& b) - { - blst_256_t ret; - sub_mod_256(ret, a, b, MOD); - return ret; - } - - inline blst_256_t& cneg(bool flag) - { cneg_mod_256(val, val, flag, MOD); return *this; } - friend inline blst_256_t cneg(const blst_256_t& a, bool flag) - { - blst_256_t ret; - cneg_mod_256(ret, a, flag, MOD); - return ret; - } - friend inline blst_256_t operator-(const blst_256_t& a) - { - blst_256_t ret; - cneg_mod_256(ret, a, true, MOD); - return ret; - } - - inline blst_256_t& operator*=(const blst_256_t& a) - { - if 
(this == &a) sqr_mont_sparse_256(val, val, MOD, M0); - else mul_mont_sparse_256(val, val, a, MOD, M0); - return *this; - } - friend inline blst_256_t operator*(const blst_256_t& a, const blst_256_t& b) - { - blst_256_t ret; - if (&a == &b) sqr_mont_sparse_256(ret, a, MOD, M0); - else mul_mont_sparse_256(ret, a, b, MOD, M0); - return ret; - } - - // simplified exponentiation, but mind the ^ operator's precedence! - friend inline blst_256_t operator^(const blst_256_t& a, unsigned p) - { - if (p < 2) { - abort(); - } else if (p == 2) { - blst_256_t ret; - sqr_mont_sparse_256(ret, a, MOD, M0); - return ret; - } else { - blst_256_t ret; - sqr_mont_sparse_256(ret, a, MOD, M0); - for (p -= 2; p--;) - mul_mont_sparse_256(ret, ret, a, MOD, M0); - return ret; - } - } - inline blst_256_t& operator^=(unsigned p) - { - if (p < 2) { - abort(); - } else if (p == 2) { - sqr_mont_sparse_256(val, val, MOD, M0); - return *this; - } - return *this = *this^p; - } - inline blst_256_t operator()(unsigned p) - { return *this^p; } - friend inline blst_256_t sqr(const blst_256_t& a) - { return a^2; } - - inline bool is_zero() const - { return vec_is_zero(val, sizeof(val)); } - - inline void zero() - { vec_zero(val, sizeof(val)); } - - blst_256_t reciprocal() const - { - static const blst_256_t MODx{MOD, true}; - union { vec512 x; vec256 r[2]; } temp; - - ct_inverse_mod_256(temp.x, val, MOD, MODx); - redc_mont_256(temp.r[0], temp.x, MOD, M0); - mul_mont_sparse_256(temp.r[0], temp.r[0], RR, MOD, M0); - - return *reinterpret_cast(temp.r[0]); - } - friend inline blst_256_t operator/(int one, const blst_256_t& a) - { - if (one == 1) - return a.reciprocal(); - abort(); - } - friend inline blst_256_t operator/(const blst_256_t& a, const blst_256_t& b) - { return a * b.reciprocal(); } - inline blst_256_t& operator/=(const blst_256_t& a) - { return *this *= a.reciprocal(); } - -#ifndef NDEBUG - inline blst_256_t(const char *hexascii) - { limbs_from_hexascii(val, sizeof(val), hexascii); to(); } - - friend inline bool operator==(const blst_256_t& a, const blst_256_t& b) - { return vec_is_equal(a, b, sizeof(vec256)); } - friend inline bool operator!=(const blst_256_t& a, const blst_256_t& b) - { return !vec_is_equal(a, b, sizeof(vec256)); } - -# if defined(_GLIBCXX_IOSTREAM) || defined(_IOSTREAM_) // non-standard - friend std::ostream& operator<<(std::ostream& os, const blst_256_t& obj) - { - unsigned char be[sizeof(obj)]; - char buf[2+2*sizeof(obj)+1], *str=buf; - - be_bytes_from_limbs(be, blst_256_t{obj}.from(), sizeof(obj)); - - *str++ = '0', *str++ = 'x'; - for (size_t i = 0; i < sizeof(obj); i++) - *str++ = hex_from_nibble(be[i]>>4), *str++ = hex_from_nibble(be[i]); - *str = '\0'; - - return os << buf; - } -# endif -#endif -}; -#endif diff --git a/crypto/blst_src/build/refresh.sh b/crypto/blst_src/build/refresh.sh deleted file mode 100755 index e8c8137c287..00000000000 --- a/crypto/blst_src/build/refresh.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/sh - -HERE=`dirname $0` -cd "${HERE}" - -PERL=${PERL:-perl} - -for pl in ../src/asm/*-x86_64.pl; do - s=`basename $pl .pl`.asm - expr $s : '.*portable' > /dev/null || (set -x; ${PERL} $pl masm > win64/$s) - s=`basename $pl .pl`.s - (set -x; ${PERL} $pl elf > elf/$s) - (set -x; ${PERL} $pl mingw64 > coff/$s) - (set -x; ${PERL} $pl macosx > mach-o/$s) -done - -for pl in ../src/asm/*-armv8.pl; do - s=`basename $pl .pl`.asm - (set -x; ${PERL} $pl win64 > win64/$s) - s=`basename $pl .pl`.S - (set -x; ${PERL} $pl linux64 > elf/$s) - (set -x; ${PERL} $pl coff64 > coff/$s) - (set -x; 
${PERL} $pl ios64 > mach-o/$s) -done - -( cd ../bindings; - echo "LIBRARY blst" - echo - echo "EXPORTS" - cc -E blst.h | \ - ${PERL} -ne '{ (/(blst_[\w]+)\s*\(/ || /(BLS12_[\w]+);/) && print "\t$1\n" }' - echo -) > win64/blst.def - -if which bindgen > /dev/null 2>&1; then - ( cd ../bindings; set -x; - bindgen --opaque-type blst_pairing \ - --opaque-type blst_uniq \ - --with-derive-default \ - --with-derive-eq \ - --size_t-is-usize \ - --rustified-enum BLST.\* \ - blst.h -- -D__BLST_RUST_BINDGEN__ \ - | ${PERL} ../build/bindings_trim.pl > rust/src/bindings.rs - ) -else - echo "Install Rust bindgen with 'cargo install bindgen'" 1>&2 - exit 1 -fi From 3c5accd9ac33e2f035e3156729b893ae21c21b07 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 14 Feb 2023 00:57:16 -0600 Subject: [PATCH 006/200] disable c99 and compile blst src and assembly --- crypto/bls.go | 2 +- crypto/bls12381_utils.go | 5 +- crypto/bls12381_utils.h | 2 +- crypto/bls_crossBLST_test.go | 47 +- crypto/bls_multisig.go | 2 +- crypto/bls_thresholdsign.go | 2 +- .../build/assembly.S => blst_assembly.S} | 0 crypto/blst_include.h | 7 + crypto/{blst_src/server.c => blst_src.c} | 0 crypto/blst_src/asm/add_mod_256-armv8.pl | 412 --- crypto/blst_src/asm/add_mod_256-x86_64.pl | 547 ---- crypto/blst_src/asm/add_mod_384-armv8.pl | 937 ------ crypto/blst_src/asm/add_mod_384-x86_64.pl | 1500 --------- crypto/blst_src/asm/add_mod_384x384-x86_64.pl | 260 -- crypto/blst_src/asm/arm-xlate.pl | 386 --- .../blst_src/asm/ct_inverse_mod_256-armv8.pl | 586 ---- .../blst_src/asm/ct_inverse_mod_256-x86_64.pl | 837 ------ .../blst_src/asm/ct_inverse_mod_384-armv8.pl | 610 ---- .../asm/ct_is_square_mod_384-armv8.pl | 401 --- .../asm/ct_is_square_mod_384-x86_64.pl | 494 --- .../asm/ctq_inverse_mod_384-x86_64.pl | 886 ------ .../asm/ctx_inverse_mod_384-x86_64.pl | 995 ------ crypto/blst_src/asm/div3w-armv8.pl | 122 - crypto/blst_src/asm/div3w-x86_64.pl | 184 -- crypto/blst_src/asm/mul_mont_256-armv8.pl | 409 --- crypto/blst_src/asm/mul_mont_384-armv8.pl | 2015 ------------- crypto/blst_src/asm/mulq_mont_256-x86_64.pl | 513 ---- crypto/blst_src/asm/mulq_mont_384-x86_64.pl | 2675 ----------------- crypto/blst_src/asm/mulx_mont_256-x86_64.pl | 486 --- crypto/blst_src/asm/mulx_mont_384-x86_64.pl | 2384 --------------- crypto/blst_src/asm/sha256-armv8.pl | 541 ---- crypto/blst_src/asm/sha256-portable-x86_64.pl | 337 --- crypto/blst_src/asm/sha256-x86_64.pl | 789 ----- crypto/blst_src/asm/x86_64-xlate.pl | 1781 ----------- crypto/blst_src/client_min_pk.c | 4 +- crypto/blst_src/client_min_sig.c | 4 +- crypto/dkg_feldmanvss.go | 2 +- crypto/dkg_feldmanvssq.go | 2 +- crypto/dkg_jointfeldman.go | 2 +- crypto/spock.go | 2 +- 40 files changed, 48 insertions(+), 21122 deletions(-) rename crypto/{blst_src/build/assembly.S => blst_assembly.S} (100%) create mode 100644 crypto/blst_include.h rename crypto/{blst_src/server.c => blst_src.c} (100%) delete mode 100755 crypto/blst_src/asm/add_mod_256-armv8.pl delete mode 100755 crypto/blst_src/asm/add_mod_256-x86_64.pl delete mode 100755 crypto/blst_src/asm/add_mod_384-armv8.pl delete mode 100755 crypto/blst_src/asm/add_mod_384-x86_64.pl delete mode 100755 crypto/blst_src/asm/add_mod_384x384-x86_64.pl delete mode 100755 crypto/blst_src/asm/arm-xlate.pl delete mode 100755 crypto/blst_src/asm/ct_inverse_mod_256-armv8.pl delete mode 100755 crypto/blst_src/asm/ct_inverse_mod_256-x86_64.pl delete mode 100755 crypto/blst_src/asm/ct_inverse_mod_384-armv8.pl delete mode 100755 crypto/blst_src/asm/ct_is_square_mod_384-armv8.pl 
delete mode 100755 crypto/blst_src/asm/ct_is_square_mod_384-x86_64.pl delete mode 100755 crypto/blst_src/asm/ctq_inverse_mod_384-x86_64.pl delete mode 100755 crypto/blst_src/asm/ctx_inverse_mod_384-x86_64.pl delete mode 100755 crypto/blst_src/asm/div3w-armv8.pl delete mode 100755 crypto/blst_src/asm/div3w-x86_64.pl delete mode 100755 crypto/blst_src/asm/mul_mont_256-armv8.pl delete mode 100755 crypto/blst_src/asm/mul_mont_384-armv8.pl delete mode 100755 crypto/blst_src/asm/mulq_mont_256-x86_64.pl delete mode 100755 crypto/blst_src/asm/mulq_mont_384-x86_64.pl delete mode 100755 crypto/blst_src/asm/mulx_mont_256-x86_64.pl delete mode 100755 crypto/blst_src/asm/mulx_mont_384-x86_64.pl delete mode 100755 crypto/blst_src/asm/sha256-armv8.pl delete mode 100755 crypto/blst_src/asm/sha256-portable-x86_64.pl delete mode 100755 crypto/blst_src/asm/sha256-x86_64.pl delete mode 100755 crypto/blst_src/asm/x86_64-xlate.pl diff --git a/crypto/bls.go b/crypto/bls.go index 1d725ebab63..6786f00c4d5 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -31,7 +31,7 @@ package crypto // - membership checks G2 using Bowe's method (https://eprint.iacr.org/2019/814.pdf) // - implement a G1/G2 swap (signatures on G2 and public keys on G1) -// #cgo CFLAGS: -g -Wall -std=c99 +// #cgo CFLAGS: // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #include "bls_include.h" import "C" diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 7e327571e47..fa931cffab6 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -7,18 +7,17 @@ package crypto // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -// #cgo CFLAGS: -g -Wall -std=c99 -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset +// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ // #include "bls12381_utils.h" -// #include "blst.h" import "C" import ( "errors" ) -// Go wrappers to Relic C types +// Go wrappers around Relic C types // Relic is compiled with ALLOC=AUTO type pointG1 C.ep_st type pointG2 C.ep2_st diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index de2efe9cb53..d6978d6188d 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -8,7 +8,7 @@ #define _REL_MISC_INCLUDE_H #include "relic.h" -#include "blst.h" +#include "blst_include.h" #define VALID RLC_OK #define INVALID RLC_ERR diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index f2ef6f16431..9ed78de2873 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -16,19 +16,26 @@ package crypto // both libraries might have made different choices. It is nevertheless a good flag for possible bugs or deviations // from the standard as both libraries are being developed. 
-import ( +/*import ( "testing" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" blst "github.com/supranational/blst/bindings/go" "pgregory.net/rapid" -) + + "github.com/onflow/flow-go/crypto" +)*/ + +// TODO: this file can't compile because of duplicate C and assembly symbols (the ones used +// by the current library and the same ones used by the imported package BLST). Unfortunately, +// cgo doesn't differentiate the two symbols. These tests need to be rewritten using the internal +// BLST C exports, instead of importing the Go BLST package. // validPrivateKeyBytesFlow generates bytes of a valid private key in Flow library -func validPrivateKeyBytesFlow(t *rapid.T) []byte { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLenBLSBLS12381, KeyGenSeedMaxLenBLSBLS12381).Draw(t, "seed").([]byte) - sk, err := GeneratePrivateKey(BLSBLS12381, seed) +/*func validPrivateKeyBytesFlow(t *rapid.T) []byte { + seed := rapid.SliceOfN(rapid.Byte(), crypto.KeyGenSeedMinLenBLSBLS12381, crypto.KeyGenSeedMaxLenBLSBLS12381).Draw(t, "seed").([]byte) + sk, err := crypto.GeneratePrivateKey(crypto.BLSBLS12381, seed) // TODO: require.NoError(t, err) seems to mess with rapid if err != nil { assert.FailNow(t, "failed key generation") @@ -38,18 +45,18 @@ func validPrivateKeyBytesFlow(t *rapid.T) []byte { // validPublicKeyBytesFlow generates bytes of a valid public key in Flow library func validPublicKeyBytesFlow(t *rapid.T) []byte { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLenBLSBLS12381, KeyGenSeedMaxLenBLSBLS12381).Draw(t, "seed").([]byte) - sk, err := GeneratePrivateKey(BLSBLS12381, seed) + seed := rapid.SliceOfN(rapid.Byte(), crypto.KeyGenSeedMinLenBLSBLS12381, crypto.KeyGenSeedMaxLenBLSBLS12381).Draw(t, "seed").([]byte) + sk, err := crypto.GeneratePrivateKey(crypto.BLSBLS12381, seed) require.NoError(t, err) return sk.PublicKey().Encode() } // validSignatureBytesFlow generates bytes of a valid signature in Flow library func validSignatureBytesFlow(t *rapid.T) []byte { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLenBLSBLS12381, KeyGenSeedMaxLenBLSBLS12381).Draw(t, "seed").([]byte) - sk, err := GeneratePrivateKey(BLSBLS12381, seed) + seed := rapid.SliceOfN(rapid.Byte(), crypto.KeyGenSeedMinLenBLSBLS12381, crypto.KeyGenSeedMaxLenBLSBLS12381).Draw(t, "seed").([]byte) + sk, err := crypto.GeneratePrivateKey(crypto.BLSBLS12381, seed) require.NoError(t, err) - hasher := NewExpandMsgXOFKMAC128("random_tag") + hasher := crypto.NewExpandMsgXOFKMAC128("random_tag") message := rapid.SliceOfN(rapid.Byte(), 1, 1000).Draw(t, "msg").([]byte) signature, err := sk.Sign(message, hasher) require.NoError(t, err) @@ -58,14 +65,14 @@ func validSignatureBytesFlow(t *rapid.T) []byte { // validPrivateKeyBytesBLST generates bytes of a valid private key in BLST library func validPrivateKeyBytesBLST(t *rapid.T) []byte { - randomSlice := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLenBLSBLS12381, KeyGenSeedMaxLenBLSBLS12381) + randomSlice := rapid.SliceOfN(rapid.Byte(), crypto.KeyGenSeedMinLenBLSBLS12381, crypto.KeyGenSeedMaxLenBLSBLS12381) ikm := randomSlice.Draw(t, "ikm").([]byte) return blst.KeyGen(ikm).Serialize() } // validPublicKeyBytesBLST generates bytes of a valid public key in BLST library func validPublicKeyBytesBLST(t *rapid.T) []byte { - ikm := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLenBLSBLS12381, KeyGenSeedMaxLenBLSBLS12381).Draw(t, "ikm").([]byte) + ikm := rapid.SliceOfN(rapid.Byte(), crypto.KeyGenSeedMinLenBLSBLS12381, crypto.KeyGenSeedMaxLenBLSBLS12381).Draw(t, "ikm").([]byte) 
blstS := blst.KeyGen(ikm) blstG2 := new(blst.P2Affine).From(blstS) return blstG2.Compress() @@ -73,7 +80,7 @@ func validPublicKeyBytesBLST(t *rapid.T) []byte { // validSignatureBytesBLST generates bytes of a valid signature in BLST library func validSignatureBytesBLST(t *rapid.T) []byte { - ikm := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLenBLSBLS12381, KeyGenSeedMaxLenBLSBLS12381).Draw(t, "ikm").([]byte) + ikm := rapid.SliceOfN(rapid.Byte(), crypto.KeyGenSeedMinLenBLSBLS12381, crypto.KeyGenSeedMaxLenBLSBLS12381).Draw(t, "ikm").([]byte) blstS := blst.KeyGen(ikm[:]) blstG1 := new(blst.P1Affine).From(blstS) return blstG1.Compress() @@ -82,14 +89,14 @@ func validSignatureBytesBLST(t *rapid.T) []byte { // testEncodeDecodePrivateKeyCrossBLST tests encoding and decoding of private keys are consistent with BLST. // This test assumes private key serialization is identical to the one in BLST. func testEncodeDecodePrivateKeyCrossBLST(t *rapid.T) { - randomSlice := rapid.SliceOfN(rapid.Byte(), prKeyLengthBLSBLS12381, prKeyLengthBLSBLS12381) + randomSlice := rapid.SliceOfN(rapid.Byte(), crypto.PrKeyLenBLSBLS12381, crypto.PrKeyLenBLSBLS12381) validSliceFlow := rapid.Custom(validPrivateKeyBytesFlow) validSliceBLST := rapid.Custom(validPrivateKeyBytesBLST) // skBytes are bytes of either a valid or a random private key skBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example().([]byte) // check decoding results are consistent - skFlow, err := DecodePrivateKey(BLSBLS12381, skBytes) + skFlow, err := crypto.DecodePrivateKey(crypto.BLSBLS12381, skBytes) var skBLST blst.Scalar res := skBLST.Deserialize(skBytes) @@ -109,14 +116,14 @@ func testEncodeDecodePrivateKeyCrossBLST(t *rapid.T) { // testEncodeDecodePublicKeyCrossBLST tests encoding and decoding of public keys keys are consistent with BLST. // This test assumes public key serialization is identical to the one in BLST. func testEncodeDecodePublicKeyCrossBLST(t *rapid.T) { - randomSlice := rapid.SliceOfN(rapid.Byte(), PubKeyLenBLSBLS12381, PubKeyLenBLSBLS12381) + randomSlice := rapid.SliceOfN(rapid.Byte(), crypto.PubKeyLenBLSBLS12381, crypto.PubKeyLenBLSBLS12381) validSliceFlow := rapid.Custom(validPublicKeyBytesFlow) validSliceBLST := rapid.Custom(validPublicKeyBytesBLST) // pkBytes are bytes of either a valid or a random public key pkBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example().([]byte) // check decoding results are consistent - pkFlow, err := DecodePublicKey(BLSBLS12381, pkBytes) + pkFlow, err := crypto.DecodePublicKey(crypto.BLSBLS12381, pkBytes) var pkBLST blst.P2Affine res := pkBLST.Deserialize(pkBytes) pkValidBLST := pkBLST.KeyValidate() @@ -137,7 +144,7 @@ func testEncodeDecodePublicKeyCrossBLST(t *rapid.T) { // testEncodeDecodeSignatureCrossBLST tests encoding and decoding of signatures are consistent with BLST. // This test assumes signature serialization is identical to the one in BLST. 
func testEncodeDecodeSignatureCrossBLST(t *rapid.T) { - randomSlice := rapid.SliceOfN(rapid.Byte(), SignatureLenBLSBLS12381, SignatureLenBLSBLS12381) + randomSlice := rapid.SliceOfN(rapid.Byte(), crypto.SignatureLenBLSBLS12381, crypto.SignatureLenBLSBLS12381) validSignatureFlow := rapid.Custom(validSignatureBytesFlow) validSignatureBLST := rapid.Custom(validSignatureBytesBLST) // sigBytes are bytes of either a valid or a random signature @@ -180,7 +187,7 @@ func testSignHashCrossBLST(t *rapid.T) { // generate two private keys from the same seed skBytes := rapid.Custom(validPrivateKeyBytesFlow).Example().([]byte) - skFlow, err := DecodePrivateKey(BLSBLS12381, skBytes) + skFlow, err := crypto.DecodePrivateKey(crypto.BLSBLS12381, skBytes) require.NoError(t, err) var skBLST blst.Scalar res := skBLST.Deserialize(skBytes) @@ -208,4 +215,4 @@ func TestAgainstBLST(t *testing.T) { rapid.Check(t, testEncodeDecodePublicKeyCrossBLST) rapid.Check(t, testEncodeDecodeSignatureCrossBLST) rapid.Check(t, testSignHashCrossBLST) -} +}*/ diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index 1dfe29abc05..a915bed4a64 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -27,7 +27,7 @@ import ( // - batch verification of multiple signatures of a single message under multiple // public keys: use a binary tree of aggregations to find the invalid signatures. -// #cgo CFLAGS: -g -Wall -std=c99 +// #cgo CFLAGS: // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #include "bls_include.h" import "C" diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 4256af84ab9..4aa73278d3a 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -3,7 +3,7 @@ package crypto -// #cgo CFLAGS: -g -Wall -std=c99 +// #cgo CFLAGS: // #include "bls_thresholdsign_include.h" import "C" diff --git a/crypto/blst_src/build/assembly.S b/crypto/blst_assembly.S similarity index 100% rename from crypto/blst_src/build/assembly.S rename to crypto/blst_assembly.S diff --git a/crypto/blst_include.h b/crypto/blst_include.h new file mode 100644 index 00000000000..586f6069590 --- /dev/null +++ b/crypto/blst_include.h @@ -0,0 +1,7 @@ +#ifndef __BLST_INCLUDE_H__ +#define __BLST_INCLUDE_H__ + +// blst related definitions +// eventually this file would replace blst.h + +#endif \ No newline at end of file diff --git a/crypto/blst_src/server.c b/crypto/blst_src.c similarity index 100% rename from crypto/blst_src/server.c rename to crypto/blst_src.c diff --git a/crypto/blst_src/asm/add_mod_256-armv8.pl b/crypto/blst_src/asm/add_mod_256-armv8.pl deleted file mode 100755 index 34d9145261b..00000000000 --- a/crypto/blst_src/asm/add_mod_256-armv8.pl +++ /dev/null @@ -1,412 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-# SPDX-License-Identifier: Apache-2.0 - -$flavour = shift; -$output = shift; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3); - -@mod=map("x$_",(4..7)); -@a=map("x$_",(8..11)); -@b=map("x$_",(12..15)); -@t=map("x$_",(16,17,1..3)); - -$code.=<<___; -.text - -.globl add_mod_256 -.hidden add_mod_256 -.type add_mod_256,%function -.align 5 -add_mod_256: - ldp @a[0],@a[1],[$a_ptr] - ldp @b[0],@b[1],[$b_ptr] - - ldp @a[2],@a[3],[$a_ptr,#16] - adds @a[0],@a[0],@b[0] - ldp @b[2],@b[3],[$b_ptr,#16] - adcs @a[1],@a[1],@b[1] - ldp @mod[0],@mod[1],[$n_ptr] - adcs @a[2],@a[2],@b[2] - ldp @mod[2],@mod[3],[$n_ptr,#16] - adcs @a[3],@a[3],@b[3] - adc @t[4],xzr,xzr - - subs @t[0],@a[0],@mod[0] - sbcs @t[1],@a[1],@mod[1] - sbcs @t[2],@a[2],@mod[2] - sbcs @t[3],@a[3],@mod[3] - sbcs xzr,@t[4],xzr - - csel @a[0],@a[0],@t[0],lo - csel @a[1],@a[1],@t[1],lo - csel @a[2],@a[2],@t[2],lo - stp @a[0],@a[1],[$r_ptr] - csel @a[3],@a[3],@t[3],lo - stp @a[2],@a[3],[$r_ptr,#16] - - ret -.size add_mod_256,.-add_mod_256 - -.globl mul_by_3_mod_256 -.hidden mul_by_3_mod_256 -.type mul_by_3_mod_256,%function -.align 5 -mul_by_3_mod_256: - ldp @b[0],@b[1],[$a_ptr] - ldp @b[2],@b[3],[$a_ptr,#16] - - adds @a[0],@b[0],@b[0] - ldp @mod[0],@mod[1],[$b_ptr] - adcs @a[1],@b[1],@b[1] - ldp @mod[2],@mod[3],[$b_ptr,#16] - adcs @a[2],@b[2],@b[2] - adcs @a[3],@b[3],@b[3] - adc @t[4],xzr,xzr - - subs @t[0],@a[0],@mod[0] - sbcs @t[1],@a[1],@mod[1] - sbcs @t[2],@a[2],@mod[2] - sbcs @t[3],@a[3],@mod[3] - sbcs xzr,@t[4],xzr - - csel @a[0],@a[0],@t[0],lo - csel @a[1],@a[1],@t[1],lo - csel @a[2],@a[2],@t[2],lo - csel @a[3],@a[3],@t[3],lo - - adds @a[0],@a[0],@b[0] - adcs @a[1],@a[1],@b[1] - adcs @a[2],@a[2],@b[2] - adcs @a[3],@a[3],@b[3] - adc @t[4],xzr,xzr - - subs @t[0],@a[0],@mod[0] - sbcs @t[1],@a[1],@mod[1] - sbcs @t[2],@a[2],@mod[2] - sbcs @t[3],@a[3],@mod[3] - sbcs xzr,@t[4],xzr - - csel @a[0],@a[0],@t[0],lo - csel @a[1],@a[1],@t[1],lo - csel @a[2],@a[2],@t[2],lo - stp @a[0],@a[1],[$r_ptr] - csel @a[3],@a[3],@t[3],lo - stp @a[2],@a[3],[$r_ptr,#16] - - ret -.size mul_by_3_mod_256,.-mul_by_3_mod_256 - -.globl lshift_mod_256 -.hidden lshift_mod_256 -.type lshift_mod_256,%function -.align 5 -lshift_mod_256: - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - -.Loop_lshift_mod_256: - adds @a[0],@a[0],@a[0] - sub $b_ptr,$b_ptr,#1 - adcs @a[1],@a[1],@a[1] - adcs @a[2],@a[2],@a[2] - adcs @a[3],@a[3],@a[3] - adc @t[4],xzr,xzr - - subs @b[0],@a[0],@mod[0] - sbcs @b[1],@a[1],@mod[1] - sbcs @b[2],@a[2],@mod[2] - sbcs @b[3],@a[3],@mod[3] - sbcs xzr,@t[4],xzr - - csel @a[0],@a[0],@b[0],lo - csel @a[1],@a[1],@b[1],lo - csel @a[2],@a[2],@b[2],lo - csel @a[3],@a[3],@b[3],lo - - cbnz $b_ptr,.Loop_lshift_mod_256 - - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - - ret -.size lshift_mod_256,.-lshift_mod_256 - -.globl rshift_mod_256 -.hidden rshift_mod_256 -.type rshift_mod_256,%function -.align 5 -rshift_mod_256: - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - -.Loop_rshift: - adds @b[0],@a[0],@mod[0] - sub $b_ptr,$b_ptr,#1 - adcs @b[1],@a[1],@mod[1] - adcs 
@b[2],@a[2],@mod[2] - adcs @b[3],@a[3],@mod[3] - adc @t[4],xzr,xzr - tst @a[0],#1 - - csel @b[0],@b[0],@a[0],ne - csel @b[1],@b[1],@a[1],ne - csel @b[2],@b[2],@a[2],ne - csel @b[3],@b[3],@a[3],ne - csel @t[4],@t[4],xzr,ne - - extr @a[0],@b[1],@b[0],#1 - extr @a[1],@b[2],@b[1],#1 - extr @a[2],@b[3],@b[2],#1 - extr @a[3],@t[4],@b[3],#1 - - cbnz $b_ptr,.Loop_rshift - - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - - ret -.size rshift_mod_256,.-rshift_mod_256 - -.globl cneg_mod_256 -.hidden cneg_mod_256 -.type cneg_mod_256,%function -.align 5 -cneg_mod_256: - ldp @a[0],@a[1],[$a_ptr] - ldp @mod[0],@mod[1],[$n_ptr] - - ldp @a[2],@a[3],[$a_ptr,#16] - subs @b[0],@mod[0],@a[0] - ldp @mod[2],@mod[3],[$n_ptr,#16] - orr @mod[0],@a[0],@a[1] - sbcs @b[1],@mod[1],@a[1] - orr @mod[1],@a[2],@a[3] - sbcs @b[2],@mod[2],@a[2] - orr @t[4],@mod[0],@mod[1] - sbc @b[3],@mod[3],@a[3] - - cmp @t[4],#0 - csetm @t[4],ne - ands $b_ptr,$b_ptr,@t[4] - - csel @a[0],@a[0],@b[0],eq - csel @a[1],@a[1],@b[1],eq - csel @a[2],@a[2],@b[2],eq - stp @a[0],@a[1],[$r_ptr] - csel @a[3],@a[3],@b[3],eq - stp @a[2],@a[3],[$r_ptr,#16] - - ret -.size cneg_mod_256,.-cneg_mod_256 - -.globl sub_mod_256 -.hidden sub_mod_256 -.type sub_mod_256,%function -.align 5 -sub_mod_256: - ldp @a[0],@a[1],[$a_ptr] - ldp @b[0],@b[1],[$b_ptr] - - ldp @a[2],@a[3],[$a_ptr,#16] - subs @a[0],@a[0],@b[0] - ldp @b[2],@b[3],[$b_ptr,#16] - sbcs @a[1],@a[1],@b[1] - ldp @mod[0],@mod[1],[$n_ptr] - sbcs @a[2],@a[2],@b[2] - ldp @mod[2],@mod[3],[$n_ptr,#16] - sbcs @a[3],@a[3],@b[3] - sbc @t[4],xzr,xzr - - and @mod[0],@mod[0],@t[4] - and @mod[1],@mod[1],@t[4] - adds @a[0],@a[0],@mod[0] - and @mod[2],@mod[2],@t[4] - adcs @a[1],@a[1],@mod[1] - and @mod[3],@mod[3],@t[4] - adcs @a[2],@a[2],@mod[2] - stp @a[0],@a[1],[$r_ptr] - adc @a[3],@a[3],@mod[3] - stp @a[2],@a[3],[$r_ptr,#16] - - ret -.size sub_mod_256,.-sub_mod_256 - -.globl check_mod_256 -.hidden check_mod_256 -.type check_mod_256,%function -.align 5 -check_mod_256: - ldp @a[0],@a[1],[$r_ptr] - ldp @a[2],@a[3],[$r_ptr,#16] - ldp @mod[0],@mod[1],[$a_ptr] - ldp @mod[2],@mod[3],[$a_ptr,#16] - -#ifdef __AARCH64EB__ - rev @a[0],@a[0] - rev @a[1],@a[1] - rev @a[2],@a[2] - rev @a[3],@a[3] -#endif - - subs xzr,@a[0],@mod[0] - sbcs xzr,@a[1],@mod[1] - orr @a[0],@a[0],@a[1] - sbcs xzr,@a[2],@mod[2] - orr @a[0],@a[0],@a[2] - sbcs xzr,@a[3],@mod[3] - orr @a[0],@a[0],@a[3] - sbc $a_ptr,xzr,xzr - - cmp @a[0],#0 - mov x0,#1 - csel x0,x0,xzr,ne - and x0,x0,$a_ptr - - ret -.size check_mod_256,.-check_mod_256 - -.globl add_n_check_mod_256 -.hidden add_n_check_mod_256 -.type add_n_check_mod_256,%function -.align 5 -add_n_check_mod_256: - ldp @a[0],@a[1],[$a_ptr] - ldp @b[0],@b[1],[$b_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @b[2],@b[3],[$b_ptr,#16] - -#ifdef __AARCH64EB__ - rev @a[0],@a[0] - rev @b[0],@b[0] - rev @a[1],@a[1] - rev @b[1],@b[1] - rev @a[2],@a[2] - rev @b[2],@b[2] - rev @a[3],@a[3] - rev @b[3],@b[3] -#endif - - adds @a[0],@a[0],@b[0] - ldp @mod[0],@mod[1],[$n_ptr] - adcs @a[1],@a[1],@b[1] - ldp @mod[2],@mod[3],[$n_ptr,#16] - adcs @a[2],@a[2],@b[2] - adcs @a[3],@a[3],@b[3] - adc @t[4],xzr,xzr - - subs @t[0],@a[0],@mod[0] - sbcs @t[1],@a[1],@mod[1] - sbcs @t[2],@a[2],@mod[2] - sbcs @t[3],@a[3],@mod[3] - sbcs xzr,@t[4],xzr - - csel @a[0],@a[0],@t[0],lo - csel @a[1],@a[1],@t[1],lo - csel @a[2],@a[2],@t[2],lo - csel @a[3],@a[3],@t[3],lo - - orr @t[0], @a[0], @a[1] - orr @t[1], @a[2], @a[3] - orr @t[0], @t[0], @t[1] - -#ifdef __AARCH64EB__ - rev @a[0],@a[0] - rev @a[1],@a[1] - rev @a[2],@a[2] - rev @a[3],@a[3] 
-#endif - - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - - mov @t[1], #1 - cmp @t[0], #0 - csel x0, @t[1], xzr, ne - - ret -.size add_n_check_mod_256,.-add_n_check_mod_256 - -.globl sub_n_check_mod_256 -.hidden sub_n_check_mod_256 -.type sub_n_check_mod_256,%function -.align 5 -sub_n_check_mod_256: - ldp @a[0],@a[1],[$a_ptr] - ldp @b[0],@b[1],[$b_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @b[2],@b[3],[$b_ptr,#16] - -#ifdef __AARCH64EB__ - rev @a[0],@a[0] - rev @b[0],@b[0] - rev @a[1],@a[1] - rev @b[1],@b[1] - rev @a[2],@a[2] - rev @b[2],@b[2] - rev @a[3],@a[3] - rev @b[3],@b[3] -#endif - - subs @a[0],@a[0],@b[0] - sbcs @a[1],@a[1],@b[1] - ldp @mod[0],@mod[1],[$n_ptr] - sbcs @a[2],@a[2],@b[2] - ldp @mod[2],@mod[3],[$n_ptr,#16] - sbcs @a[3],@a[3],@b[3] - sbc @t[4],xzr,xzr - - and @mod[0],@mod[0],@t[4] - and @mod[1],@mod[1],@t[4] - adds @a[0],@a[0],@mod[0] - and @mod[2],@mod[2],@t[4] - adcs @a[1],@a[1],@mod[1] - and @mod[3],@mod[3],@t[4] - adcs @a[2],@a[2],@mod[2] - adc @a[3],@a[3],@mod[3] - - orr @t[0], @a[0], @a[1] - orr @t[1], @a[2], @a[3] - orr @t[0], @t[0], @t[1] - -#ifdef __AARCH64EB__ - rev @a[0],@a[0] - rev @a[1],@a[1] - rev @a[2],@a[2] - rev @a[3],@a[3] -#endif - - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - - mov @t[1], #1 - cmp @t[0], #0 - csel x0, @t[1], xzr, ne - - ret -.size sub_n_check_mod_256,.-sub_n_check_mod_256 -___ - -print $code; - -close STDOUT; diff --git a/crypto/blst_src/asm/add_mod_256-x86_64.pl b/crypto/blst_src/asm/add_mod_256-x86_64.pl deleted file mode 100755 index 1d656fb90bf..00000000000 --- a/crypto/blst_src/asm/add_mod_256-x86_64.pl +++ /dev/null @@ -1,547 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-# SPDX-License-Identifier: Apache-2.0 - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" - or die "can't call $xlate: $!"; - -# common argument layout -($r_ptr,$a_ptr,$b_org,$n_ptr) = ("%rdi","%rsi","%rdx","%rcx"); -$b_ptr = "%rbx"; - -{ ############################################################## 256 bits add -my @acc=map("%r$_",(8..11, "ax", "si", "bx", "bp", 12)); - -$code.=<<___; -.text - -.globl add_mod_256 -.hidden add_mod_256 -.type add_mod_256,\@function,4,"unwind" -.align 32 -add_mod_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - -.Loaded_a_add_mod_256: - add 8*0($b_org), @acc[0] - adc 8*1($b_org), @acc[1] - mov @acc[0], @acc[4] - adc 8*2($b_org), @acc[2] - mov @acc[1], @acc[5] - adc 8*3($b_org), @acc[3] - sbb $b_org, $b_org - - mov @acc[2], @acc[6] - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - mov @acc[3], @acc[7] - sbb 8*3($n_ptr), @acc[3] - sbb \$0, $b_org - - cmovc @acc[4], @acc[0] - cmovc @acc[5], @acc[1] - mov @acc[0], 8*0($r_ptr) - cmovc @acc[6], @acc[2] - mov @acc[1], 8*1($r_ptr) - cmovc @acc[7], @acc[3] - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - - mov 8(%rsp),%rbx -.cfi_restore %rbx - mov 16(%rsp),%rbp -.cfi_restore %rbp - lea 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 -.cfi_epilogue - ret -.cfi_endproc -.size add_mod_256,.-add_mod_256 - -######################################################################## -.globl mul_by_3_mod_256 -.hidden mul_by_3_mod_256 -.type mul_by_3_mod_256,\@function,3,"unwind" -.align 32 -mul_by_3_mod_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 -.cfi_end_prologue - - mov $b_org,$n_ptr - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov $a_ptr,$b_org - mov 8*3($a_ptr), @acc[3] - - call __lshift_mod_256 - mov 0(%rsp),%r12 -.cfi_restore %r12 - jmp .Loaded_a_add_mod_256 - - mov 8(%rsp),%rbx -.cfi_restore %rbx - mov 16(%rsp),%rbp -.cfi_restore %rbp - lea 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 -.cfi_epilogue - ret -.cfi_endproc -.size mul_by_3_mod_256,.-mul_by_3_mod_256 - -.type __lshift_mod_256,\@abi-omnipotent -.align 32 -__lshift_mod_256: - add @acc[0], @acc[0] - adc @acc[1], @acc[1] - mov @acc[0], @acc[4] - adc @acc[2], @acc[2] - mov @acc[1], @acc[5] - adc @acc[3], @acc[3] - sbb @acc[8], @acc[8] - - mov @acc[2], @acc[6] - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - mov @acc[3], @acc[7] - sbb 8*3($n_ptr), @acc[3] - sbb \$0, @acc[8] - - cmovc @acc[4], @acc[0] - cmovc @acc[5], @acc[1] - cmovc @acc[6], @acc[2] - cmovc @acc[7], @acc[3] - - ret -.size __lshift_mod_256,.-__lshift_mod_256 - -######################################################################## -.globl lshift_mod_256 -.hidden lshift_mod_256 -.type lshift_mod_256,\@function,4,"unwind" -.align 32 -lshift_mod_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 
-.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - -.Loop_lshift_mod_256: - call __lshift_mod_256 - dec %edx - jnz .Loop_lshift_mod_256 - - mov @acc[0], 8*0($r_ptr) - mov @acc[1], 8*1($r_ptr) - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - - mov 0(%rsp),%r12 -.cfi_restore %r12 - mov 8(%rsp),%rbx -.cfi_restore %rbx - mov 16(%rsp),%rbp -.cfi_restore %rbp - lea 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 -.cfi_epilogue - ret -.cfi_endproc -.size lshift_mod_256,.-lshift_mod_256 - -######################################################################## -.globl rshift_mod_256 -.hidden rshift_mod_256 -.type rshift_mod_256,\@function,4,"unwind" -.align 32 -rshift_mod_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[7] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - -.Loop_rshift_mod_256: - mov @acc[7], @acc[0] - and \$1, @acc[7] - mov 8*0($n_ptr), @acc[4] - neg @acc[7] - mov 8*1($n_ptr), @acc[5] - mov 8*2($n_ptr), @acc[6] - - and @acc[7], @acc[4] - and @acc[7], @acc[5] - and @acc[7], @acc[6] - and 8*3($n_ptr), @acc[7] - - add @acc[4], @acc[0] - adc @acc[5], @acc[1] - adc @acc[6], @acc[2] - adc @acc[7], @acc[3] - sbb @acc[4], @acc[4] - - shr \$1, @acc[0] - mov @acc[1], @acc[7] - shr \$1, @acc[1] - mov @acc[2], @acc[6] - shr \$1, @acc[2] - mov @acc[3], @acc[5] - shr \$1, @acc[3] - - shl \$63, @acc[7] - shl \$63, @acc[6] - or @acc[0], @acc[7] - shl \$63, @acc[5] - or @acc[6], @acc[1] - shl \$63, @acc[4] - or @acc[5], @acc[2] - or @acc[4], @acc[3] - - dec %edx - jnz .Loop_rshift_mod_256 - - mov @acc[7], 8*0($r_ptr) - mov @acc[1], 8*1($r_ptr) - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - - mov 8(%rsp),%rbx -.cfi_restore %rbx - mov 16(%rsp),%rbp -.cfi_restore %rbp - lea 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 -.cfi_epilogue - ret -.cfi_endproc -.size rshift_mod_256,.-rshift_mod_256 - -######################################################################## -.globl cneg_mod_256 -.hidden cneg_mod_256 -.type cneg_mod_256,\@function,4,"unwind" -.align 32 -cneg_mod_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[8] # load a[0:3] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov @acc[8], @acc[0] - mov 8*3($a_ptr), @acc[3] - or @acc[1], @acc[8] - or @acc[2], @acc[8] - or @acc[3], @acc[8] - mov \$-1, @acc[7] - - mov 8*0($n_ptr), @acc[4] # load n[0:3] - cmovnz @acc[7], @acc[8] # mask = a[0:3] ? -1 : 0 - mov 8*1($n_ptr), @acc[5] - mov 8*2($n_ptr), @acc[6] - and @acc[8], @acc[4] # n[0:3] &= mask - mov 8*3($n_ptr), @acc[7] - and @acc[8], @acc[5] - and @acc[8], @acc[6] - and @acc[8], @acc[7] - - sub @acc[0], @acc[4] # a[0:3] ? n[0:3]-a[0:3] : 0-0 - sbb @acc[1], @acc[5] - sbb @acc[2], @acc[6] - sbb @acc[3], @acc[7] - - or $b_org, $b_org # check condition flag - - cmovz @acc[0], @acc[4] # flag ? 
n[0:3]-a[0:3] : a[0:3] - cmovz @acc[1], @acc[5] - mov @acc[4], 8*0($r_ptr) - cmovz @acc[2], @acc[6] - mov @acc[5], 8*1($r_ptr) - cmovz @acc[3], @acc[7] - mov @acc[6], 8*2($r_ptr) - mov @acc[7], 8*3($r_ptr) - - mov 0(%rsp),%r12 -.cfi_restore %r12 - mov 8(%rsp),%rbx -.cfi_restore %rbx - mov 16(%rsp),%rbp -.cfi_restore %rbp - lea 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 -.cfi_epilogue - ret -.cfi_endproc -.size cneg_mod_256,.-cneg_mod_256 - -######################################################################## -.globl sub_mod_256 -.hidden sub_mod_256 -.type sub_mod_256,\@function,4,"unwind" -.align 32 -sub_mod_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - - sub 8*0($b_org), @acc[0] - mov 8*0($n_ptr), @acc[4] - sbb 8*1($b_org), @acc[1] - mov 8*1($n_ptr), @acc[5] - sbb 8*2($b_org), @acc[2] - mov 8*2($n_ptr), @acc[6] - sbb 8*3($b_org), @acc[3] - mov 8*3($n_ptr), @acc[7] - sbb $b_org, $b_org - - and $b_org, @acc[4] - and $b_org, @acc[5] - and $b_org, @acc[6] - and $b_org, @acc[7] - - add @acc[4], @acc[0] - adc @acc[5], @acc[1] - mov @acc[0], 8*0($r_ptr) - adc @acc[6], @acc[2] - mov @acc[1], 8*1($r_ptr) - adc @acc[7], @acc[3] - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - - mov 8(%rsp),%rbx -.cfi_restore %rbx - mov 16(%rsp),%rbp -.cfi_restore %rbp - lea 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 -.cfi_epilogue - ret -.cfi_endproc -.size sub_mod_256,.-sub_mod_256 - -######################################################################## -.globl check_mod_256 -.hidden check_mod_256 -.type check_mod_256,\@function,2,"unwind" -.align 32 -check_mod_256: -.cfi_startproc - mov 8*0($r_ptr), %rax - mov 8*1($r_ptr), @acc[1] - mov 8*2($r_ptr), @acc[2] - mov 8*3($r_ptr), @acc[3] - - mov %rax, @acc[0] # see if it's zero - or @acc[1], %rax - or @acc[2], %rax - or @acc[3], %rax - - sub 8*0($a_ptr), @acc[0] # does subtracting modulus borrow? 
- sbb 8*1($a_ptr), @acc[1] - sbb 8*2($a_ptr), @acc[2] - sbb 8*3($a_ptr), @acc[3] - sbb $a_ptr, $a_ptr - - mov \$1, %rdx - cmp \$0, %rax - cmovne %rdx, %rax - and $a_ptr, %rax -.cfi_epilogue - ret -.cfi_endproc -.size check_mod_256,.-check_mod_256 - -######################################################################## -.globl add_n_check_mod_256 -.hidden add_n_check_mod_256 -.type add_n_check_mod_256,\@function,4,"unwind" -.align 32 -add_n_check_mod_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - - add 8*0($b_org), @acc[0] - adc 8*1($b_org), @acc[1] - mov @acc[0], @acc[4] - adc 8*2($b_org), @acc[2] - mov @acc[1], @acc[5] - adc 8*3($b_org), @acc[3] - sbb $b_org, $b_org - - mov @acc[2], @acc[6] - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - mov @acc[3], @acc[7] - sbb 8*3($n_ptr), @acc[3] - sbb \$0, $b_org - - cmovc @acc[4], @acc[0] - cmovc @acc[5], @acc[1] - mov @acc[0], 8*0($r_ptr) - cmovc @acc[6], @acc[2] - mov @acc[1], 8*1($r_ptr) - cmovc @acc[7], @acc[3] - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - - or @acc[1], @acc[0] - or @acc[3], @acc[2] - or @acc[2], @acc[0] - mov \$1, %rax - cmovz @acc[0], %rax - - mov 8(%rsp),%rbx -.cfi_restore %rbx - mov 16(%rsp),%rbp -.cfi_restore %rbp - lea 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 -.cfi_epilogue - ret -.cfi_endproc -.size add_n_check_mod_256,.-add_n_check_mod_256 - -######################################################################## -.globl sub_n_check_mod_256 -.hidden sub_n_check_mod_256 -.type sub_n_check_mod_256,\@function,4,"unwind" -.align 32 -sub_n_check_mod_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - - sub 8*0($b_org), @acc[0] - mov 8*0($n_ptr), @acc[4] - sbb 8*1($b_org), @acc[1] - mov 8*1($n_ptr), @acc[5] - sbb 8*2($b_org), @acc[2] - mov 8*2($n_ptr), @acc[6] - sbb 8*3($b_org), @acc[3] - mov 8*3($n_ptr), @acc[7] - sbb $b_org, $b_org - - and $b_org, @acc[4] - and $b_org, @acc[5] - and $b_org, @acc[6] - and $b_org, @acc[7] - - add @acc[4], @acc[0] - adc @acc[5], @acc[1] - mov @acc[0], 8*0($r_ptr) - adc @acc[6], @acc[2] - mov @acc[1], 8*1($r_ptr) - adc @acc[7], @acc[3] - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - - or @acc[1], @acc[0] - or @acc[3], @acc[2] - or @acc[2], @acc[0] - mov \$1, %rax - cmovz @acc[0], %rax - - mov 8(%rsp),%rbx -.cfi_restore %rbx - mov 16(%rsp),%rbp -.cfi_restore %rbp - lea 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 -.cfi_epilogue - ret -.cfi_endproc -.size sub_n_check_mod_256,.-sub_n_check_mod_256 -___ -} - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/add_mod_384-armv8.pl b/crypto/blst_src/asm/add_mod_384-armv8.pl deleted file mode 100755 index 6accdbb19a1..00000000000 --- a/crypto/blst_src/asm/add_mod_384-armv8.pl +++ /dev/null @@ -1,937 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-# SPDX-License-Identifier: Apache-2.0 - -$flavour = shift; -$output = shift; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3); - -@mod=map("x$_",(4..9)); -@a=map("x$_",(10..15)); -@b=map("x$_",(16,17,19..22)); -$carry=$n_ptr; - -$code.=<<___; -.text - -.globl add_mod_384 -.hidden add_mod_384 -.type add_mod_384,%function -.align 5 -add_mod_384: - paciasp - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - - bl __add_mod_384 - ldr x30,[sp,#8] - - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - stp @a[4],@a[5],[$r_ptr,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size add_mod_384,.-add_mod_384 - -.type __add_mod_384,%function -.align 5 -__add_mod_384: - ldp @a[0],@a[1],[$a_ptr] - ldp @b[0],@b[1],[$b_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @b[2],@b[3],[$b_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - ldp @b[4],@b[5],[$b_ptr,#32] - -__add_mod_384_ab_are_loaded: - adds @a[0],@a[0],@b[0] - adcs @a[1],@a[1],@b[1] - adcs @a[2],@a[2],@b[2] - adcs @a[3],@a[3],@b[3] - adcs @a[4],@a[4],@b[4] - adcs @a[5],@a[5],@b[5] - adc $carry,xzr,xzr - - subs @b[0],@a[0],@mod[0] - sbcs @b[1],@a[1],@mod[1] - sbcs @b[2],@a[2],@mod[2] - sbcs @b[3],@a[3],@mod[3] - sbcs @b[4],@a[4],@mod[4] - sbcs @b[5],@a[5],@mod[5] - sbcs xzr,$carry,xzr - - csel @a[0],@a[0],@b[0],lo - csel @a[1],@a[1],@b[1],lo - csel @a[2],@a[2],@b[2],lo - csel @a[3],@a[3],@b[3],lo - csel @a[4],@a[4],@b[4],lo - csel @a[5],@a[5],@b[5],lo - - ret -.size __add_mod_384,.-__add_mod_384 - -.globl add_mod_384x -.hidden add_mod_384x -.type add_mod_384x,%function -.align 5 -add_mod_384x: - paciasp - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - - bl __add_mod_384 - - stp @a[0],@a[1],[$r_ptr] - add $a_ptr,$a_ptr,#48 - stp @a[2],@a[3],[$r_ptr,#16] - add $b_ptr,$b_ptr,#48 - stp @a[4],@a[5],[$r_ptr,#32] - - bl __add_mod_384 - ldr x30,[sp,#8] - - stp @a[0],@a[1],[$r_ptr,#48] - stp @a[2],@a[3],[$r_ptr,#64] - stp @a[4],@a[5],[$r_ptr,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size add_mod_384x,.-add_mod_384x - -.globl rshift_mod_384 -.hidden rshift_mod_384 -.type rshift_mod_384,%function -.align 5 -rshift_mod_384: - paciasp - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - -.Loop_rshift_mod_384: - sub $b_ptr,$b_ptr,#1 - bl __rshift_mod_384 - cbnz $b_ptr,.Loop_rshift_mod_384 - - ldr x30,[sp,#8] - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - stp @a[4],@a[5],[$r_ptr,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size rshift_mod_384,.-rshift_mod_384 - -.type __rshift_mod_384,%function -.align 5 -__rshift_mod_384: - sbfx @b[5],@a[0],#0,#1 - and @b[0],@b[5],@mod[0] - and @b[1],@b[5],@mod[1] - adds @a[0],@a[0],@b[0] - and @b[2],@b[5],@mod[2] - adcs @a[1],@a[1],@b[1] - and @b[3],@b[5],@mod[3] - adcs @a[2],@a[2],@b[2] - and @b[4],@b[5],@mod[4] - adcs @a[3],@a[3],@b[3] - and @b[5],@b[5],@mod[5] - adcs @a[4],@a[4],@b[4] - extr @a[0],@a[1],@a[0],#1 // a[0:5] >>= 1 - adcs @a[5],@a[5],@b[5] - extr @a[1],@a[2],@a[1],#1 - adc @b[5],xzr,xzr - extr @a[2],@a[3],@a[2],#1 - extr @a[3],@a[4],@a[3],#1 - extr @a[4],@a[5],@a[4],#1 - extr @a[5],@b[5],@a[5],#1 - ret -.size __rshift_mod_384,.-__rshift_mod_384 - -.globl div_by_2_mod_384 -.hidden div_by_2_mod_384 -.type div_by_2_mod_384,%function -.align 5 -div_by_2_mod_384: - paciasp - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - ldp @mod[0],@mod[1],[$b_ptr] - ldp @mod[2],@mod[3],[$b_ptr,#16] - ldp @mod[4],@mod[5],[$b_ptr,#32] - - bl __rshift_mod_384 - - ldr x30,[sp,#8] - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - stp @a[4],@a[5],[$r_ptr,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size div_by_2_mod_384,.-div_by_2_mod_384 - -.globl lshift_mod_384 -.hidden lshift_mod_384 -.type lshift_mod_384,%function -.align 5 -lshift_mod_384: - paciasp - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - -.Loop_lshift_mod_384: - sub $b_ptr,$b_ptr,#1 - bl __lshift_mod_384 - cbnz $b_ptr,.Loop_lshift_mod_384 - - ldr x30,[sp,#8] - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - stp @a[4],@a[5],[$r_ptr,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size lshift_mod_384,.-lshift_mod_384 - -.type __lshift_mod_384,%function -.align 5 -__lshift_mod_384: - adds @a[0],@a[0],@a[0] - adcs @a[1],@a[1],@a[1] - adcs @a[2],@a[2],@a[2] - adcs @a[3],@a[3],@a[3] - adcs @a[4],@a[4],@a[4] - adcs @a[5],@a[5],@a[5] - adc $carry,xzr,xzr - - subs @b[0],@a[0],@mod[0] - sbcs @b[1],@a[1],@mod[1] - sbcs @b[2],@a[2],@mod[2] - sbcs @b[3],@a[3],@mod[3] - sbcs @b[4],@a[4],@mod[4] - sbcs @b[5],@a[5],@mod[5] - sbcs xzr,$carry,xzr - - csel @a[0],@a[0],@b[0],lo - csel @a[1],@a[1],@b[1],lo - csel @a[2],@a[2],@b[2],lo - csel @a[3],@a[3],@b[3],lo - csel @a[4],@a[4],@b[4],lo - csel @a[5],@a[5],@b[5],lo - - ret -.size __lshift_mod_384,.-__lshift_mod_384 - -.globl mul_by_3_mod_384 -.hidden mul_by_3_mod_384 -.type mul_by_3_mod_384,%function -.align 5 -mul_by_3_mod_384: - paciasp - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - ldp @mod[0],@mod[1],[$b_ptr] - ldp @mod[2],@mod[3],[$b_ptr,#16] - ldp @mod[4],@mod[5],[$b_ptr,#32] - - bl __lshift_mod_384 - - ldp @b[0],@b[1],[$a_ptr] - ldp @b[2],@b[3],[$a_ptr,#16] - ldp @b[4],@b[5],[$a_ptr,#32] - - bl __add_mod_384_ab_are_loaded - ldr x30,[sp,#8] - - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - stp @a[4],@a[5],[$r_ptr,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size mul_by_3_mod_384,.-mul_by_3_mod_384 - -.globl mul_by_8_mod_384 -.hidden mul_by_8_mod_384 -.type mul_by_8_mod_384,%function -.align 5 -mul_by_8_mod_384: - paciasp - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - ldp @mod[0],@mod[1],[$b_ptr] - ldp @mod[2],@mod[3],[$b_ptr,#16] - ldp @mod[4],@mod[5],[$b_ptr,#32] - - bl __lshift_mod_384 - bl __lshift_mod_384 - bl __lshift_mod_384 - ldr x30,[sp,#8] - - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - stp @a[4],@a[5],[$r_ptr,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size mul_by_8_mod_384,.-mul_by_8_mod_384 - -.globl mul_by_3_mod_384x -.hidden mul_by_3_mod_384x -.type mul_by_3_mod_384x,%function -.align 5 -mul_by_3_mod_384x: - paciasp - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - ldp @mod[0],@mod[1],[$b_ptr] - ldp @mod[2],@mod[3],[$b_ptr,#16] - ldp @mod[4],@mod[5],[$b_ptr,#32] - - bl __lshift_mod_384 - - ldp @b[0],@b[1],[$a_ptr] - ldp @b[2],@b[3],[$a_ptr,#16] - ldp @b[4],@b[5],[$a_ptr,#32] - - bl __add_mod_384_ab_are_loaded - - stp @a[0],@a[1],[$r_ptr] - ldp @a[0],@a[1],[$a_ptr,#48] - stp @a[2],@a[3],[$r_ptr,#16] - ldp @a[2],@a[3],[$a_ptr,#64] - stp @a[4],@a[5],[$r_ptr,#32] - ldp @a[4],@a[5],[$a_ptr,#80] - - bl __lshift_mod_384 - - ldp @b[0],@b[1],[$a_ptr,#48] - ldp @b[2],@b[3],[$a_ptr,#64] - ldp @b[4],@b[5],[$a_ptr,#80] - - bl __add_mod_384_ab_are_loaded - ldr x30,[sp,#8] - - stp @a[0],@a[1],[$r_ptr,#48] - stp @a[2],@a[3],[$r_ptr,#64] - stp @a[4],@a[5],[$r_ptr,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size mul_by_3_mod_384x,.-mul_by_3_mod_384x - -.globl mul_by_8_mod_384x -.hidden mul_by_8_mod_384x -.type mul_by_8_mod_384x,%function -.align 5 -mul_by_8_mod_384x: - paciasp - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - ldp @mod[0],@mod[1],[$b_ptr] - ldp @mod[2],@mod[3],[$b_ptr,#16] - ldp @mod[4],@mod[5],[$b_ptr,#32] - - bl __lshift_mod_384 - bl __lshift_mod_384 - bl __lshift_mod_384 - - stp @a[0],@a[1],[$r_ptr] - ldp @a[0],@a[1],[$a_ptr,#48] - stp @a[2],@a[3],[$r_ptr,#16] - ldp @a[2],@a[3],[$a_ptr,#64] - stp @a[4],@a[5],[$r_ptr,#32] - ldp @a[4],@a[5],[$a_ptr,#80] - - bl __lshift_mod_384 - bl __lshift_mod_384 - bl __lshift_mod_384 - ldr x30,[sp,#8] - - stp @a[0],@a[1],[$r_ptr,#48] - stp @a[2],@a[3],[$r_ptr,#64] - stp @a[4],@a[5],[$r_ptr,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size mul_by_8_mod_384x,.-mul_by_8_mod_384x - -.globl cneg_mod_384 -.hidden cneg_mod_384 -.type cneg_mod_384,%function -.align 5 -cneg_mod_384: - paciasp - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @a[0],@a[1],[$a_ptr] - ldp @mod[0],@mod[1],[$n_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @mod[2],@mod[3],[$n_ptr,#16] - - subs @b[0],@mod[0],@a[0] - ldp @a[4],@a[5],[$a_ptr,#32] - ldp @mod[4],@mod[5],[$n_ptr,#32] - orr $carry,@a[0],@a[1] - sbcs @b[1],@mod[1],@a[1] - orr $carry,$carry,@a[2] - sbcs @b[2],@mod[2],@a[2] - orr $carry,$carry,@a[3] - sbcs @b[3],@mod[3],@a[3] - orr $carry,$carry,@a[4] - sbcs @b[4],@mod[4],@a[4] - orr $carry,$carry,@a[5] - sbc @b[5],@mod[5],@a[5] - - cmp $carry,#0 - csetm $carry,ne - ands $b_ptr,$b_ptr,$carry - - csel @a[0],@a[0],@b[0],eq - csel @a[1],@a[1],@b[1],eq - csel @a[2],@a[2],@b[2],eq - csel @a[3],@a[3],@b[3],eq - stp @a[0],@a[1],[$r_ptr] - csel @a[4],@a[4],@b[4],eq - stp @a[2],@a[3],[$r_ptr,#16] - csel @a[5],@a[5],@b[5],eq - stp @a[4],@a[5],[$r_ptr,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size cneg_mod_384,.-cneg_mod_384 - -.globl sub_mod_384 -.hidden sub_mod_384 -.type sub_mod_384,%function -.align 5 -sub_mod_384: - paciasp - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - - bl __sub_mod_384 - ldr x30,[sp,#8] - - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - stp @a[4],@a[5],[$r_ptr,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size sub_mod_384,.-sub_mod_384 - -.type __sub_mod_384,%function -.align 5 -__sub_mod_384: - ldp @a[0],@a[1],[$a_ptr] - ldp @b[0],@b[1],[$b_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @b[2],@b[3],[$b_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - ldp @b[4],@b[5],[$b_ptr,#32] - - subs @a[0],@a[0],@b[0] - sbcs @a[1],@a[1],@b[1] - sbcs @a[2],@a[2],@b[2] - sbcs @a[3],@a[3],@b[3] - sbcs @a[4],@a[4],@b[4] - sbcs @a[5],@a[5],@b[5] - sbc $carry,xzr,xzr - - and @b[0],@mod[0],$carry - and @b[1],@mod[1],$carry - adds @a[0],@a[0],@b[0] - and @b[2],@mod[2],$carry - adcs @a[1],@a[1],@b[1] - and @b[3],@mod[3],$carry - adcs @a[2],@a[2],@b[2] - and @b[4],@mod[4],$carry - adcs @a[3],@a[3],@b[3] - and @b[5],@mod[5],$carry - adcs @a[4],@a[4],@b[4] - adc @a[5],@a[5],@b[5] - - ret -.size __sub_mod_384,.-__sub_mod_384 - -.globl sub_mod_384x -.hidden sub_mod_384x -.type sub_mod_384x,%function -.align 5 -sub_mod_384x: - paciasp - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - - bl __sub_mod_384 - - stp @a[0],@a[1],[$r_ptr] - add $a_ptr,$a_ptr,#48 - stp @a[2],@a[3],[$r_ptr,#16] - add $b_ptr,$b_ptr,#48 - stp @a[4],@a[5],[$r_ptr,#32] - - bl __sub_mod_384 - ldr x30,[sp,#8] - - stp @a[0],@a[1],[$r_ptr,#48] - stp @a[2],@a[3],[$r_ptr,#64] - stp @a[4],@a[5],[$r_ptr,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size sub_mod_384x,.-sub_mod_384x - -.globl mul_by_1_plus_i_mod_384x -.hidden mul_by_1_plus_i_mod_384x -.type mul_by_1_plus_i_mod_384x,%function -.align 5 -mul_by_1_plus_i_mod_384x: - paciasp - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @mod[0],@mod[1],[$b_ptr] - ldp @mod[2],@mod[3],[$b_ptr,#16] - ldp @mod[4],@mod[5],[$b_ptr,#32] - add $b_ptr,$a_ptr,#48 - - bl __sub_mod_384 // a->re - a->im - - ldp @b[0],@b[1],[$a_ptr] - ldp @b[2],@b[3],[$a_ptr,#16] - ldp @b[4],@b[5],[$a_ptr,#32] - stp @a[0],@a[1],[$r_ptr] - ldp @a[0],@a[1],[$a_ptr,#48] - stp @a[2],@a[3],[$r_ptr,#16] - ldp @a[2],@a[3],[$a_ptr,#64] - stp @a[4],@a[5],[$r_ptr,#32] - ldp @a[4],@a[5],[$a_ptr,#80] - - bl __add_mod_384_ab_are_loaded // a->re + a->im - ldr x30,[sp,#8] - - stp @a[0],@a[1],[$r_ptr,#48] - stp @a[2],@a[3],[$r_ptr,#64] - stp @a[4],@a[5],[$r_ptr,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x - -.globl sgn0_pty_mod_384 -.hidden sgn0_pty_mod_384 -.type sgn0_pty_mod_384,%function -.align 5 -sgn0_pty_mod_384: - ldp @a[0],@a[1],[$r_ptr] - ldp @a[2],@a[3],[$r_ptr,#16] - ldp @a[4],@a[5],[$r_ptr,#32] - - ldp @mod[0],@mod[1],[$a_ptr] - ldp @mod[2],@mod[3],[$a_ptr,#16] - ldp @mod[4],@mod[5],[$a_ptr,#32] - - and $r_ptr,@a[0],#1 - adds @a[0],@a[0],@a[0] - adcs @a[1],@a[1],@a[1] - adcs @a[2],@a[2],@a[2] - adcs @a[3],@a[3],@a[3] - adcs @a[4],@a[4],@a[4] - adcs @a[5],@a[5],@a[5] - adc $carry,xzr,xzr - - subs @a[0],@a[0],@mod[0] - sbcs @a[1],@a[1],@mod[1] - sbcs @a[2],@a[2],@mod[2] - sbcs @a[3],@a[3],@mod[3] - sbcs @a[4],@a[4],@mod[4] - sbcs @a[5],@a[5],@mod[5] - sbc $carry,$carry,xzr - - mvn $carry,$carry - and $carry,$carry,#2 - orr $r_ptr,$r_ptr,$carry - - ret -.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 - -.globl sgn0_pty_mod_384x -.hidden sgn0_pty_mod_384x -.type sgn0_pty_mod_384x,%function -.align 5 -sgn0_pty_mod_384x: - ldp @a[0],@a[1],[$r_ptr] - ldp @a[2],@a[3],[$r_ptr,#16] - ldp @a[4],@a[5],[$r_ptr,#32] - - ldp @mod[0],@mod[1],[$a_ptr] - ldp @mod[2],@mod[3],[$a_ptr,#16] - ldp @mod[4],@mod[5],[$a_ptr,#32] - - and $b_ptr,@a[0],#1 - orr $n_ptr,@a[0],@a[1] - adds @a[0],@a[0],@a[0] - orr $n_ptr,$n_ptr,@a[2] - adcs @a[1],@a[1],@a[1] - orr $n_ptr,$n_ptr,@a[3] - adcs @a[2],@a[2],@a[2] - orr $n_ptr,$n_ptr,@a[4] - adcs @a[3],@a[3],@a[3] - orr $n_ptr,$n_ptr,@a[5] - adcs @a[4],@a[4],@a[4] - adcs @a[5],@a[5],@a[5] - adc @b[0],xzr,xzr - - subs @a[0],@a[0],@mod[0] - sbcs @a[1],@a[1],@mod[1] - sbcs @a[2],@a[2],@mod[2] - sbcs @a[3],@a[3],@mod[3] - sbcs @a[4],@a[4],@mod[4] - sbcs @a[5],@a[5],@mod[5] - sbc @b[0],@b[0],xzr - - ldp @a[0],@a[1],[$r_ptr,#48] - ldp @a[2],@a[3],[$r_ptr,#64] - ldp @a[4],@a[5],[$r_ptr,#80] - - mvn @b[0],@b[0] - and @b[0],@b[0],#2 - orr $b_ptr,$b_ptr,@b[0] - - and $r_ptr,@a[0],#1 - orr $a_ptr,@a[0],@a[1] - adds @a[0],@a[0],@a[0] - orr $a_ptr,$a_ptr,@a[2] - adcs @a[1],@a[1],@a[1] - orr $a_ptr,$a_ptr,@a[3] - adcs @a[2],@a[2],@a[2] - orr 
$a_ptr,$a_ptr,@a[4] - adcs @a[3],@a[3],@a[3] - orr $a_ptr,$a_ptr,@a[5] - adcs @a[4],@a[4],@a[4] - adcs @a[5],@a[5],@a[5] - adc @b[0],xzr,xzr - - subs @a[0],@a[0],@mod[0] - sbcs @a[1],@a[1],@mod[1] - sbcs @a[2],@a[2],@mod[2] - sbcs @a[3],@a[3],@mod[3] - sbcs @a[4],@a[4],@mod[4] - sbcs @a[5],@a[5],@mod[5] - sbc @b[0],@b[0],xzr - - mvn @b[0],@b[0] - and @b[0],@b[0],#2 - orr $r_ptr,$r_ptr,@b[0] - - cmp $n_ptr,#0 - csel $n_ptr,$r_ptr,$b_ptr,eq // a->re==0? prty(a->im) : prty(a->re) - - cmp $a_ptr,#0 - csel $a_ptr,$r_ptr,$b_ptr,ne // a->im!=0? sgn0(a->im) : sgn0(a->re) - - and $n_ptr,$n_ptr,#1 - and $a_ptr,$a_ptr,#2 - orr $r_ptr,$a_ptr,$n_ptr // pack sign and parity - - ret -.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x -___ -if (1) { -sub vec_select { -my $sz = shift; -my @v=map("v$_",(0..5,16..21)); - -$code.=<<___; -.globl vec_select_$sz -.hidden vec_select_$sz -.type vec_select_$sz,%function -.align 5 -vec_select_$sz: - dup v6.2d, $n_ptr - ld1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$a_ptr],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {@v[3].2d, @v[4].2d, @v[5].2d}, [$b_ptr],#48 -___ -for($i=0; $i<$sz-48; $i+=48) { -$code.=<<___; - bit @v[0].16b, @v[3].16b, v6.16b - ld1 {@v[6].2d, @v[7].2d, @v[8].2d}, [$a_ptr],#48 - bit @v[1].16b, @v[4].16b, v6.16b - ld1 {@v[9].2d, @v[10].2d, @v[11].2d}, [$b_ptr],#48 - bit @v[2].16b, @v[5].16b, v6.16b - st1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr],#48 -___ - @v = @v[6..11,0..5]; -} -$code.=<<___; - bit @v[0].16b, @v[3].16b, v6.16b - bit @v[1].16b, @v[4].16b, v6.16b - bit @v[2].16b, @v[5].16b, v6.16b - st1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr] - ret -.size vec_select_$sz,.-vec_select_$sz -___ -} -vec_select(32); -vec_select(48); -vec_select(96); -vec_select(192); -vec_select(144); -vec_select(288); -} - -{ -my ($inp, $end, $step) = map("x$_", (0..2)); - -$code.=<<___; -.globl vec_prefetch -.hidden vec_prefetch -.type vec_prefetch,%function -.align 5 -vec_prefetch: - add $end, $end, $inp - sub $end, $end, #1 - mov $step, #64 - prfm pldl1keep, [$inp] - add $inp, $inp, $step - cmp $inp, $end - csel $inp, $end, $inp, hi - csel $step, xzr, $step, hi - prfm pldl1keep, [$inp] - add $inp, $inp, $step - cmp $inp, $end - csel $inp, $end, $inp, hi - csel $step, xzr, $step, hi - prfm pldl1keep, [$inp] - add $inp, $inp, $step - cmp $inp, $end - csel $inp, $end, $inp, hi - csel $step, xzr, $step, hi - prfm pldl1keep, [$inp] - add $inp, $inp, $step - cmp $inp, $end - csel $inp, $end, $inp, hi - csel $step, xzr, $step, hi - prfm pldl1keep, [$inp] - add $inp, $inp, $step - cmp $inp, $end - csel $inp, $end, $inp, hi - csel $step, xzr, $step, hi - prfm pldl1keep, [$inp] - add $inp, $inp, $step - cmp $inp, $end - csel $inp, $end, $inp, hi - prfm pldl1keep, [$inp] - ret -.size vec_prefetch,.-vec_prefetch -___ -my $len = $end; - -$code.=<<___; -.globl vec_is_zero_16x -.hidden vec_is_zero_16x -.type vec_is_zero_16x,%function -.align 5 -vec_is_zero_16x: - ld1 {v0.2d}, [$inp], #16 - lsr $len, $len, #4 - sub $len, $len, #1 - cbz $len, .Loop_is_zero_done - -.Loop_is_zero: - ld1 {v1.2d}, [$inp], #16 - orr v0.16b, v0.16b, v1.16b - sub $len, $len, #1 - cbnz $len, .Loop_is_zero - -.Loop_is_zero_done: - dup v1.2d, v0.2d[1] - orr v0.16b, v0.16b, v1.16b - mov x1, v0.2d[0] - mov x0, #1 - cmp x1, #0 - csel x0, x0, xzr, eq - ret -.size vec_is_zero_16x,.-vec_is_zero_16x -___ -} -{ -my ($inp1, $inp2, $len) = map("x$_", (0..2)); - -$code.=<<___; -.globl vec_is_equal_16x -.hidden vec_is_equal_16x -.type vec_is_equal_16x,%function -.align 5 -vec_is_equal_16x: - ld1 {v0.2d}, [$inp1], #16 - ld1 {v1.2d}, [$inp2], #16 - 
lsr $len, $len, #4 - eor v0.16b, v0.16b, v1.16b - -.Loop_is_equal: - sub $len, $len, #1 - cbz $len, .Loop_is_equal_done - ld1 {v1.2d}, [$inp1], #16 - ld1 {v2.2d}, [$inp2], #16 - eor v1.16b, v1.16b, v2.16b - orr v0.16b, v0.16b, v1.16b - b .Loop_is_equal - nop - -.Loop_is_equal_done: - dup v1.2d, v0.2d[1] - orr v0.16b, v0.16b, v1.16b - mov x1, v0.2d[0] - mov x0, #1 - cmp x1, #0 - csel x0, x0, xzr, eq - ret -.size vec_is_equal_16x,.-vec_is_equal_16x -___ -} - -print $code; - -close STDOUT; diff --git a/crypto/blst_src/asm/add_mod_384-x86_64.pl b/crypto/blst_src/asm/add_mod_384-x86_64.pl deleted file mode 100755 index a196191c108..00000000000 --- a/crypto/blst_src/asm/add_mod_384-x86_64.pl +++ /dev/null @@ -1,1500 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" - or die "can't call $xlate: $!"; - -# common argument layout -($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); -$b_ptr = "%rbx"; - -{ ############################################################## 384 bits add -my @acc=map("%r$_",(8..15, "ax", "bx", "bp")); - push(@acc, $a_ptr); - -$code.=<<___; -.text - -.globl add_mod_384 -.hidden add_mod_384 -.type add_mod_384,\@function,4,"unwind" -.align 32 -add_mod_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - call __add_mod_384 - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size add_mod_384,.-add_mod_384 - -.type __add_mod_384,\@abi-omnipotent -.align 32 -__add_mod_384: - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - -__add_mod_384_a_is_loaded: - add 8*0($b_org), @acc[0] - adc 8*1($b_org), @acc[1] - adc 8*2($b_org), @acc[2] - mov @acc[0], @acc[6] - adc 8*3($b_org), @acc[3] - mov @acc[1], @acc[7] - adc 8*4($b_org), @acc[4] - mov @acc[2], @acc[8] - adc 8*5($b_org), @acc[5] - mov @acc[3], @acc[9] - sbb $b_org, $b_org - - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - mov @acc[4], @acc[10] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - mov @acc[5], @acc[11] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, $b_org - - cmovc @acc[6], @acc[0] - cmovc @acc[7], @acc[1] - cmovc @acc[8], @acc[2] - mov @acc[0], 8*0($r_ptr) - cmovc @acc[9], @acc[3] - mov @acc[1], 8*1($r_ptr) - cmovc @acc[10], @acc[4] - mov @acc[2], 8*2($r_ptr) - cmovc @acc[11], @acc[5] - mov @acc[3], 8*3($r_ptr) - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - ret -.size __add_mod_384,.-__add_mod_384 - -.globl add_mod_384x -.hidden 
add_mod_384x -.type add_mod_384x,\@function,4,"unwind" -.align 32 -add_mod_384x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$24, %rsp -.cfi_adjust_cfa_offset 24 -.cfi_end_prologue - - mov $a_ptr, 8*0(%rsp) - mov $b_org, 8*1(%rsp) - lea 48($a_ptr), $a_ptr # a->im - lea 48($b_org), $b_org # b->im - lea 48($r_ptr), $r_ptr # ret->im - call __add_mod_384 # add_mod_384(ret->im, a->im, b->im, mod); - - mov 8*0(%rsp), $a_ptr # a->re - mov 8*1(%rsp), $b_org # b->re - lea -48($r_ptr), $r_ptr # ret->re - call __add_mod_384 # add_mod_384(ret->re, a->re, b->re, mod); - - mov 24+8*0(%rsp),%r15 -.cfi_restore %r15 - mov 24+8*1(%rsp),%r14 -.cfi_restore %r14 - mov 24+8*2(%rsp),%r13 -.cfi_restore %r13 - mov 24+8*3(%rsp),%r12 -.cfi_restore %r12 - mov 24+8*4(%rsp),%rbx -.cfi_restore %rbx - mov 24+8*5(%rsp),%rbp -.cfi_restore %rbp - lea 24+8*6(%rsp),%rsp -.cfi_adjust_cfa_offset -24-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size add_mod_384x,.-add_mod_384x - -######################################################################## -.globl rshift_mod_384 -.hidden rshift_mod_384 -.type rshift_mod_384,\@function,4,"unwind" -.align 32 -rshift_mod_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $r_ptr -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - -.Loop_rshift_mod_384: - call __rshift_mod_384 - dec %edx - jnz .Loop_rshift_mod_384 - - mov @acc[0], 8*0($r_ptr) - mov @acc[1], 8*1($r_ptr) - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size rshift_mod_384,.-rshift_mod_384 - -.type __rshift_mod_384,\@abi-omnipotent -.align 32 -__rshift_mod_384: - mov \$1, @acc[11] - mov 8*0($n_ptr), @acc[6] - and @acc[0], @acc[11] - mov 8*1($n_ptr), @acc[7] - neg @acc[11] - mov 8*2($n_ptr), @acc[8] - and @acc[11], @acc[6] - mov 8*3($n_ptr), @acc[9] - and @acc[11], @acc[7] - mov 8*4($n_ptr), @acc[10] - and @acc[11], @acc[8] - and @acc[11], @acc[9] - and @acc[11], @acc[10] - and 8*5($n_ptr), @acc[11] - - add @acc[0], @acc[6] - adc @acc[1], @acc[7] - adc @acc[2], @acc[8] - adc @acc[3], @acc[9] - adc @acc[4], @acc[10] - adc @acc[5], @acc[11] - sbb @acc[5], @acc[5] - - shr \$1, @acc[6] - mov @acc[7], @acc[0] - shr \$1, @acc[7] - mov @acc[8], @acc[1] - shr \$1, @acc[8] - mov @acc[9], @acc[2] - shr \$1, @acc[9] - mov @acc[10], @acc[3] - shr \$1, @acc[10] - mov @acc[11], @acc[4] - shr \$1, @acc[11] - shl \$63, @acc[0] - shl \$63, @acc[1] - or @acc[6], @acc[0] - shl \$63, @acc[2] - or @acc[7], @acc[1] - shl \$63, @acc[3] - or @acc[8], @acc[2] - shl \$63, @acc[4] - or @acc[9], @acc[3] - shl \$63, @acc[5] - or @acc[10], @acc[4] - or @acc[11], @acc[5] - - ret -.size __rshift_mod_384,.-__rshift_mod_384 - -.globl div_by_2_mod_384 -.hidden div_by_2_mod_384 -.type div_by_2_mod_384,\@function,3,"unwind" -.align 32 
-div_by_2_mod_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $r_ptr -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov $b_org, $n_ptr - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - - call __rshift_mod_384 - - mov @acc[0], 8*0($r_ptr) - mov @acc[1], 8*1($r_ptr) - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size div_by_2_mod_384,.-div_by_2_mod_384 - -######################################################################## -.globl lshift_mod_384 -.hidden lshift_mod_384 -.type lshift_mod_384,\@function,4,"unwind" -.align 32 -lshift_mod_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $r_ptr -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - -.Loop_lshift_mod_384: - add @acc[0], @acc[0] - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - mov @acc[0], @acc[6] - adc @acc[3], @acc[3] - mov @acc[1], @acc[7] - adc @acc[4], @acc[4] - mov @acc[2], @acc[8] - adc @acc[5], @acc[5] - mov @acc[3], @acc[9] - sbb $r_ptr, $r_ptr - - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - mov @acc[4], @acc[10] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - mov @acc[5], @acc[11] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, $r_ptr - - mov (%rsp), $r_ptr - cmovc @acc[6], @acc[0] - cmovc @acc[7], @acc[1] - cmovc @acc[8], @acc[2] - cmovc @acc[9], @acc[3] - cmovc @acc[10], @acc[4] - cmovc @acc[11], @acc[5] - - dec %edx - jnz .Loop_lshift_mod_384 - - mov @acc[0], 8*0($r_ptr) - mov @acc[1], 8*1($r_ptr) - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size lshift_mod_384,.-lshift_mod_384 - -.type __lshift_mod_384,\@abi-omnipotent -.align 32 -__lshift_mod_384: - add @acc[0], @acc[0] - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - mov @acc[0], @acc[6] - adc @acc[3], @acc[3] - mov @acc[1], @acc[7] - adc @acc[4], @acc[4] - mov @acc[2], @acc[8] - adc @acc[5], @acc[5] - mov @acc[3], @acc[9] - sbb $b_org, $b_org - - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - mov @acc[4], @acc[10] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - mov @acc[5], @acc[11] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, $b_org - - cmovc @acc[6], @acc[0] - cmovc @acc[7], @acc[1] - cmovc @acc[8], @acc[2] - cmovc @acc[9], @acc[3] - cmovc @acc[10], @acc[4] - cmovc @acc[11], @acc[5] - - 
ret -.size __lshift_mod_384,.-__lshift_mod_384 - -######################################################################## -.globl mul_by_3_mod_384 -.hidden mul_by_3_mod_384 -.type mul_by_3_mod_384,\@function,3,"unwind" -.align 32 -mul_by_3_mod_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $a_ptr -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - mov $b_org, $n_ptr - - call __lshift_mod_384 - - mov (%rsp), $b_org - call __add_mod_384_a_is_loaded - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size mul_by_3_mod_384,.-mul_by_3_mod_384 - -.globl mul_by_8_mod_384 -.hidden mul_by_8_mod_384 -.type mul_by_8_mod_384,\@function,3,"unwind" -.align 32 -mul_by_8_mod_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - mov $b_org, $n_ptr - - call __lshift_mod_384 - call __lshift_mod_384 - call __lshift_mod_384 - - mov @acc[0], 8*0($r_ptr) - mov @acc[1], 8*1($r_ptr) - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size mul_by_8_mod_384,.-mul_by_8_mod_384 - -######################################################################## -.globl mul_by_3_mod_384x -.hidden mul_by_3_mod_384x -.type mul_by_3_mod_384x,\@function,3,"unwind" -.align 32 -mul_by_3_mod_384x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $a_ptr -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - mov $b_org, $n_ptr - - call __lshift_mod_384 - - mov (%rsp), $b_org - call __add_mod_384_a_is_loaded - - mov (%rsp), $a_ptr - lea 8*6($r_ptr), $r_ptr - - mov 8*6($a_ptr), @acc[0] - mov 8*7($a_ptr), @acc[1] - mov 8*8($a_ptr), @acc[2] - mov 8*9($a_ptr), @acc[3] - mov 8*10($a_ptr), @acc[4] - mov 8*11($a_ptr), @acc[5] - - call __lshift_mod_384 - - mov \$8*6, $b_org - add (%rsp), $b_org - call __add_mod_384_a_is_loaded - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 
56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size mul_by_3_mod_384x,.-mul_by_3_mod_384x - -.globl mul_by_8_mod_384x -.hidden mul_by_8_mod_384x -.type mul_by_8_mod_384x,\@function,3,"unwind" -.align 32 -mul_by_8_mod_384x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $a_ptr -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - mov $b_org, $n_ptr - - call __lshift_mod_384 - call __lshift_mod_384 - call __lshift_mod_384 - - mov (%rsp), $a_ptr - mov @acc[0], 8*0($r_ptr) - mov @acc[1], 8*1($r_ptr) - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - mov 48+8*0($a_ptr), @acc[0] - mov 48+8*1($a_ptr), @acc[1] - mov 48+8*2($a_ptr), @acc[2] - mov 48+8*3($a_ptr), @acc[3] - mov 48+8*4($a_ptr), @acc[4] - mov 48+8*5($a_ptr), @acc[5] - - call __lshift_mod_384 - call __lshift_mod_384 - call __lshift_mod_384 - - mov @acc[0], 48+8*0($r_ptr) - mov @acc[1], 48+8*1($r_ptr) - mov @acc[2], 48+8*2($r_ptr) - mov @acc[3], 48+8*3($r_ptr) - mov @acc[4], 48+8*4($r_ptr) - mov @acc[5], 48+8*5($r_ptr) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size mul_by_8_mod_384x,.-mul_by_8_mod_384x - -######################################################################## -.globl cneg_mod_384 -.hidden cneg_mod_384 -.type cneg_mod_384,\@function,4,"unwind" -.align 32 -cneg_mod_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $b_org # condition flag -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), $b_org # load a[0:5] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov $b_org, @acc[0] - mov 8*3($a_ptr), @acc[3] - or @acc[1], $b_org - mov 8*4($a_ptr), @acc[4] - or @acc[2], $b_org - mov 8*5($a_ptr), @acc[5] - or @acc[3], $b_org - mov \$-1, @acc[11] - or @acc[4], $b_org - or @acc[5], $b_org - - mov 8*0($n_ptr), @acc[6] # load n[0:5] - cmovnz @acc[11], $b_org # mask = a[0:5] ? -1 : 0 - mov 8*1($n_ptr), @acc[7] - mov 8*2($n_ptr), @acc[8] - and $b_org, @acc[6] # n[0:5] &= mask - mov 8*3($n_ptr), @acc[9] - and $b_org, @acc[7] - mov 8*4($n_ptr), @acc[10] - and $b_org, @acc[8] - mov 8*5($n_ptr), @acc[11] - and $b_org, @acc[9] - mov 0(%rsp), $n_ptr # restore condition flag - and $b_org, @acc[10] - and $b_org, @acc[11] - - sub @acc[0], @acc[6] # a[0:5] ? n[0:5]-a[0:5] : 0-0 - sbb @acc[1], @acc[7] - sbb @acc[2], @acc[8] - sbb @acc[3], @acc[9] - sbb @acc[4], @acc[10] - sbb @acc[5], @acc[11] - - or $n_ptr, $n_ptr # check condition flag - - cmovz @acc[0], @acc[6] # flag ? 
n[0:5]-a[0:5] : a[0:5] - cmovz @acc[1], @acc[7] - cmovz @acc[2], @acc[8] - mov @acc[6], 8*0($r_ptr) - cmovz @acc[3], @acc[9] - mov @acc[7], 8*1($r_ptr) - cmovz @acc[4], @acc[10] - mov @acc[8], 8*2($r_ptr) - cmovz @acc[5], @acc[11] - mov @acc[9], 8*3($r_ptr) - mov @acc[10], 8*4($r_ptr) - mov @acc[11], 8*5($r_ptr) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size cneg_mod_384,.-cneg_mod_384 - -######################################################################## -.globl sub_mod_384 -.hidden sub_mod_384 -.type sub_mod_384,\@function,4,"unwind" -.align 32 -sub_mod_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - call __sub_mod_384 - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size sub_mod_384,.-sub_mod_384 - -.type __sub_mod_384,\@abi-omnipotent -.align 32 -__sub_mod_384: - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - - sub 8*0($b_org), @acc[0] - mov 8*0($n_ptr), @acc[6] - sbb 8*1($b_org), @acc[1] - mov 8*1($n_ptr), @acc[7] - sbb 8*2($b_org), @acc[2] - mov 8*2($n_ptr), @acc[8] - sbb 8*3($b_org), @acc[3] - mov 8*3($n_ptr), @acc[9] - sbb 8*4($b_org), @acc[4] - mov 8*4($n_ptr), @acc[10] - sbb 8*5($b_org), @acc[5] - mov 8*5($n_ptr), @acc[11] - sbb $b_org, $b_org - - and $b_org, @acc[6] - and $b_org, @acc[7] - and $b_org, @acc[8] - and $b_org, @acc[9] - and $b_org, @acc[10] - and $b_org, @acc[11] - - add @acc[6], @acc[0] - adc @acc[7], @acc[1] - mov @acc[0], 8*0($r_ptr) - adc @acc[8], @acc[2] - mov @acc[1], 8*1($r_ptr) - adc @acc[9], @acc[3] - mov @acc[2], 8*2($r_ptr) - adc @acc[10], @acc[4] - mov @acc[3], 8*3($r_ptr) - adc @acc[11], @acc[5] - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - ret -.size __sub_mod_384,.-__sub_mod_384 - -.globl sub_mod_384x -.hidden sub_mod_384x -.type sub_mod_384x,\@function,4,"unwind" -.align 32 -sub_mod_384x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$24, %rsp -.cfi_adjust_cfa_offset 24 -.cfi_end_prologue - - mov $a_ptr, 8*0(%rsp) - mov $b_org, 8*1(%rsp) - lea 48($a_ptr), $a_ptr # a->im - lea 48($b_org), $b_org # b->im - lea 48($r_ptr), $r_ptr # ret->im - call __sub_mod_384 # sub_mod_384(ret->im, a->im, b->im, mod); - - mov 8*0(%rsp), $a_ptr # a->re - mov 8*1(%rsp), $b_org # b->re - lea -48($r_ptr), $r_ptr # ret->re - call __sub_mod_384 # sub_mod_384(ret->re, a->re, b->re, mod); - - mov 24+8*0(%rsp),%r15 -.cfi_restore %r15 - mov 24+8*1(%rsp),%r14 -.cfi_restore %r14 - mov 24+8*2(%rsp),%r13 -.cfi_restore %r13 - mov 24+8*3(%rsp),%r12 -.cfi_restore %r12 - mov 24+8*4(%rsp),%rbx -.cfi_restore %rbx - mov 24+8*5(%rsp),%rbp -.cfi_restore %rbp - lea 
24+8*6(%rsp),%rsp -.cfi_adjust_cfa_offset -24-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size sub_mod_384x,.-sub_mod_384x -___ -} -{ ###################################################### ret = a * (1 + i) -my ($r_ptr,$a_ptr,$n_ptr) = ("%rdi","%rsi","%rdx"); -my @acc=map("%r$_",(8..15, "ax", "bx", "cx", "bp")); - -$code.=<<___; -.globl mul_by_1_plus_i_mod_384x -.hidden mul_by_1_plus_i_mod_384x -.type mul_by_1_plus_i_mod_384x,\@function,3,"unwind" -.align 32 -mul_by_1_plus_i_mod_384x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$56, %rsp -.cfi_adjust_cfa_offset 56 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - - mov @acc[0], @acc[6] - add 8*6($a_ptr), @acc[0] # a->re + a->im - mov @acc[1], @acc[7] - adc 8*7($a_ptr), @acc[1] - mov @acc[2], @acc[8] - adc 8*8($a_ptr), @acc[2] - mov @acc[3], @acc[9] - adc 8*9($a_ptr), @acc[3] - mov @acc[4], @acc[10] - adc 8*10($a_ptr), @acc[4] - mov @acc[5], @acc[11] - adc 8*11($a_ptr), @acc[5] - mov $r_ptr, 8*6(%rsp) # offload r_ptr - sbb $r_ptr, $r_ptr - - sub 8*6($a_ptr), @acc[6] # a->re - a->im - sbb 8*7($a_ptr), @acc[7] - sbb 8*8($a_ptr), @acc[8] - sbb 8*9($a_ptr), @acc[9] - sbb 8*10($a_ptr), @acc[10] - sbb 8*11($a_ptr), @acc[11] - sbb $a_ptr, $a_ptr - - mov @acc[0], 8*0(%rsp) # offload a->re + a->im [without carry] - mov 8*0($n_ptr), @acc[0] - mov @acc[1], 8*1(%rsp) - mov 8*1($n_ptr), @acc[1] - mov @acc[2], 8*2(%rsp) - mov 8*2($n_ptr), @acc[2] - mov @acc[3], 8*3(%rsp) - mov 8*3($n_ptr), @acc[3] - mov @acc[4], 8*4(%rsp) - and $a_ptr, @acc[0] - mov 8*4($n_ptr), @acc[4] - mov @acc[5], 8*5(%rsp) - and $a_ptr, @acc[1] - mov 8*5($n_ptr), @acc[5] - and $a_ptr, @acc[2] - and $a_ptr, @acc[3] - and $a_ptr, @acc[4] - and $a_ptr, @acc[5] - mov 8*6(%rsp), $a_ptr # restore r_ptr - - add @acc[0], @acc[6] - mov 8*0(%rsp), @acc[0] # restore a->re + a->im - adc @acc[1], @acc[7] - mov 8*1(%rsp), @acc[1] - adc @acc[2], @acc[8] - mov 8*2(%rsp), @acc[2] - adc @acc[3], @acc[9] - mov 8*3(%rsp), @acc[3] - adc @acc[4], @acc[10] - mov 8*4(%rsp), @acc[4] - adc @acc[5], @acc[11] - mov 8*5(%rsp), @acc[5] - - mov @acc[6], 8*0($a_ptr) # ret->re = a->re - a->im - mov @acc[0], @acc[6] - mov @acc[7], 8*1($a_ptr) - mov @acc[8], 8*2($a_ptr) - mov @acc[1], @acc[7] - mov @acc[9], 8*3($a_ptr) - mov @acc[10], 8*4($a_ptr) - mov @acc[2], @acc[8] - mov @acc[11], 8*5($a_ptr) - - sub 8*0($n_ptr), @acc[0] - mov @acc[3], @acc[9] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - mov @acc[4], @acc[10] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - mov @acc[5], @acc[11] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, $r_ptr - - cmovc @acc[6], @acc[0] - cmovc @acc[7], @acc[1] - cmovc @acc[8], @acc[2] - mov @acc[0], 8*6($a_ptr) # ret->im = a->re + a->im - cmovc @acc[9], @acc[3] - mov @acc[1], 8*7($a_ptr) - cmovc @acc[10], @acc[4] - mov @acc[2], 8*8($a_ptr) - cmovc @acc[11], @acc[5] - mov @acc[3], 8*9($a_ptr) - mov @acc[4], 8*10($a_ptr) - mov @acc[5], 8*11($a_ptr) - - mov 56+8*0(%rsp),%r15 -.cfi_restore %r15 - mov 56+8*1(%rsp),%r14 -.cfi_restore %r14 - mov 56+8*2(%rsp),%r13 -.cfi_restore %r13 - mov 56+8*3(%rsp),%r12 -.cfi_restore %r12 - mov 56+8*4(%rsp),%rbx -.cfi_restore %rbx - mov 56+8*5(%rsp),%rbp -.cfi_restore %rbp - lea 56+8*6(%rsp),%rsp -.cfi_adjust_cfa_offset -56-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size 
mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x -___ -} -{ ###################################################### -my ($r_ptr,$n_ptr) = ("%rdi","%rsi"); -my @acc=map("%r$_",(8..11, "cx", "dx", "bx", "bp")); - -$code.=<<___; -.globl sgn0_pty_mod_384 -.hidden sgn0_pty_mod_384 -.type sgn0_pty_mod_384,\@function,2,"unwind" -.align 32 -sgn0_pty_mod_384: -.cfi_startproc -.cfi_end_prologue - mov 8*0($r_ptr), @acc[0] - mov 8*1($r_ptr), @acc[1] - mov 8*2($r_ptr), @acc[2] - mov 8*3($r_ptr), @acc[3] - mov 8*4($r_ptr), @acc[4] - mov 8*5($r_ptr), @acc[5] - - xor %rax, %rax - mov @acc[0], $r_ptr - add @acc[0], @acc[0] - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - adc @acc[3], @acc[3] - adc @acc[4], @acc[4] - adc @acc[5], @acc[5] - adc \$0, %rax - - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, %rax - - not %rax # 2*x > p, which means "negative" - and \$1, $r_ptr - and \$2, %rax - or $r_ptr, %rax # pack sign and parity - -.cfi_epilogue - ret -.cfi_endproc -.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 - -.globl sgn0_pty_mod_384x -.hidden sgn0_pty_mod_384x -.type sgn0_pty_mod_384x,\@function,2,"unwind" -.align 32 -sgn0_pty_mod_384x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*6($r_ptr), @acc[0] # sgn0(a->im) - mov 8*7($r_ptr), @acc[1] - mov 8*8($r_ptr), @acc[2] - mov 8*9($r_ptr), @acc[3] - mov 8*10($r_ptr), @acc[4] - mov 8*11($r_ptr), @acc[5] - - mov @acc[0], @acc[6] - or @acc[1], @acc[0] - or @acc[2], @acc[0] - or @acc[3], @acc[0] - or @acc[4], @acc[0] - or @acc[5], @acc[0] - - lea 0($r_ptr), %rax # sgn0(a->re) - xor $r_ptr, $r_ptr - mov @acc[6], @acc[7] - add @acc[6], @acc[6] - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - adc @acc[3], @acc[3] - adc @acc[4], @acc[4] - adc @acc[5], @acc[5] - adc \$0, $r_ptr - - sub 8*0($n_ptr), @acc[6] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, $r_ptr - - mov @acc[0], 0(%rsp) # a->im is zero or not - not $r_ptr # 2*x > p, which means "negative" - and \$1, @acc[7] - and \$2, $r_ptr - or @acc[7], $r_ptr # pack sign and parity - - mov 8*0(%rax), @acc[0] - mov 8*1(%rax), @acc[1] - mov 8*2(%rax), @acc[2] - mov 8*3(%rax), @acc[3] - mov 8*4(%rax), @acc[4] - mov 8*5(%rax), @acc[5] - - mov @acc[0], @acc[6] - or @acc[1], @acc[0] - or @acc[2], @acc[0] - or @acc[3], @acc[0] - or @acc[4], @acc[0] - or @acc[5], @acc[0] - - xor %rax, %rax - mov @acc[6], @acc[7] - add @acc[6], @acc[6] - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - adc @acc[3], @acc[3] - adc @acc[4], @acc[4] - adc @acc[5], @acc[5] - adc \$0, %rax - - sub 8*0($n_ptr), @acc[6] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, %rax - - mov 0(%rsp), @acc[6] - - not %rax # 2*x > p, which means "negative" - - test @acc[0], @acc[0] - cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) - - test @acc[6], @acc[6] - cmovnz $r_ptr, %rax # a->im!=0? sgn0(a->im) : sgn0(a->re) - - and \$1, @acc[7] - and \$2, %rax - or @acc[7], %rax # pack sign and parity - - mov 8(%rsp), %rbx -.cfi_restore %rbx - mov 16(%rsp), %rbp -.cfi_restore %rbp - lea 24(%rsp), %rsp -.cfi_adjust_cfa_offset -24 -.cfi_epilogue - ret -.cfi_endproc -.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x -___ -} -if (0) { -my $inp = $win64 ? 
"%rcx" : "%rdi"; -$code.=<<___; -.globl nbits_384 -.hidden nbits_384 -.type nbits_384,\@abi-omnipotent -.align 32 -nbits_384: - mov 8*5($inp), %r8 - mov 8*4($inp), %r9 - mov 8*3($inp), %r10 - mov 8*2($inp), %r11 - mov \$-1, %rdx - mov \$127, %eax - bsr %r8, %r8 - cmovnz %rdx,%r9 - cmovz %rax,%r8 - bsr %r9, %r9 - cmovnz %rdx,%r10 - cmovz %rax,%r9 - xor \$63,%r8 - bsr %r10, %r10 - cmovnz %rdx, %r11 - cmovz %rax, %r10 - xor \$63,%r9 - add %r8, %r9 - mov 8*1($inp), %r8 - bsr %r11, %r11 - cmovnz %rdx, %r8 - cmovz %rax, %r11 - xor \$63, %r10 - add %r9, %r10 - mov 8*0($inp), %r9 - bsr %r8, %r8 - cmovnz %rdx, %r9 - cmovz %rax, %r8 - xor \$63, %r11 - add %r10, %r11 - bsr %r9, %r9 - cmovz %rax, %r9 - xor \$63, %r8 - add %r11, %r8 - xor \$63, %r9 - add %r8, %r9 - mov \$384, %eax - sub %r9, %rax - ret -.size nbits_384,.-nbits_384 -___ -} - -if (1) { -my ($out, $inp1, $inp2, $select) = $win64 ? ("%rcx", "%rdx", "%r8", "%r9d") - : ("%rdi", "%rsi", "%rdx", "%ecx"); - -sub vec_select { -my $sz = shift; -my $half = $sz/2; -my ($xmm0,$xmm1,$xmm2,$xmm3)=map("%xmm$_",(0..3)); - -$code.=<<___; -.globl vec_select_$sz -.hidden vec_select_$sz -.type vec_select_$sz,\@abi-omnipotent -.align 32 -vec_select_$sz: - movd $select, %xmm5 - pxor %xmm4,%xmm4 - pshufd \$0,%xmm5,%xmm5 # broadcast - movdqu ($inp1),$xmm0 - lea $half($inp1),$inp1 - pcmpeqd %xmm4,%xmm5 - movdqu ($inp2),$xmm1 - lea $half($inp2),$inp2 - pcmpeqd %xmm5,%xmm4 - lea $half($out),$out -___ -for($i=0; $i<$sz-16; $i+=16) { -$code.=<<___; - pand %xmm4,$xmm0 - movdqu $i+16-$half($inp1),$xmm2 - pand %xmm5,$xmm1 - movdqu $i+16-$half($inp2),$xmm3 - por $xmm1,$xmm0 - movdqu $xmm0,$i-$half($out) -___ - ($xmm0,$xmm1,$xmm2,$xmm3)=($xmm2,$xmm3,$xmm0,$xmm1); -} -$code.=<<___; - pand %xmm4,$xmm0 - pand %xmm5,$xmm1 - por $xmm1,$xmm0 - movdqu $xmm0,$i-$half($out) - ret -.size vec_select_$sz,.-vec_select_$sz -___ -} -vec_select(32); -vec_select(48); -vec_select(96); -vec_select(192); -vec_select(144); -vec_select(288); -} - -{ -my ($inp, $end) = $win64 ? ("%rcx", "%rdx") : ("%rdi", "%rsi"); - -$code.=<<___; -.globl vec_prefetch -.hidden vec_prefetch -.type vec_prefetch,\@abi-omnipotent -.align 32 -vec_prefetch: - leaq -1($inp,$end), $end - mov \$64, %rax - xor %r8, %r8 - prefetchnta ($inp) - lea ($inp,%rax), $inp - cmp $end, $inp - cmova $end, $inp - cmova %r8, %rax - prefetchnta ($inp) - lea ($inp,%rax), $inp - cmp $end, $inp - cmova $end, $inp - cmova %r8, %rax - prefetchnta ($inp) - lea ($inp,%rax), $inp - cmp $end, $inp - cmova $end, $inp - cmova %r8, %rax - prefetchnta ($inp) - lea ($inp,%rax), $inp - cmp $end, $inp - cmova $end, $inp - cmova %r8, %rax - prefetchnta ($inp) - lea ($inp,%rax), $inp - cmp $end, $inp - cmova $end, $inp - cmova %r8, %rax - prefetchnta ($inp) - lea ($inp,%rax), $inp - cmp $end, $inp - cmova $end, $inp - prefetchnta ($inp) - ret -.size vec_prefetch,.-vec_prefetch -___ -my $len = $win64 ? "%edx" : "%esi"; - -$code.=<<___; -.globl vec_is_zero_16x -.hidden vec_is_zero_16x -.type vec_is_zero_16x,\@abi-omnipotent -.align 32 -vec_is_zero_16x: - shr \$4, $len - movdqu ($inp), %xmm0 - lea 16($inp), $inp - -.Loop_is_zero: - dec $len - jz .Loop_is_zero_done - movdqu ($inp), %xmm1 - lea 16($inp), $inp - por %xmm1, %xmm0 - jmp .Loop_is_zero - -.Loop_is_zero_done: - pshufd \$0x4e, %xmm0, %xmm1 - por %xmm1, %xmm0 - movq %xmm0, %rax - inc $len # now it's 1 - test %rax, %rax - cmovnz $len, %eax - xor \$1, %eax - ret -.size vec_is_zero_16x,.-vec_is_zero_16x -___ -} -{ -my ($inp1, $inp2, $len) = $win64 ? 
("%rcx", "%rdx", "%r8d") - : ("%rdi", "%rsi", "%edx"); -$code.=<<___; -.globl vec_is_equal_16x -.hidden vec_is_equal_16x -.type vec_is_equal_16x,\@abi-omnipotent -.align 32 -vec_is_equal_16x: - shr \$4, $len - movdqu ($inp1), %xmm0 - movdqu ($inp2), %xmm1 - sub $inp1, $inp2 - lea 16($inp1), $inp1 - pxor %xmm1, %xmm0 - -.Loop_is_equal: - dec $len - jz .Loop_is_equal_done - movdqu ($inp1), %xmm1 - movdqu ($inp1,$inp2), %xmm2 - lea 16($inp1), $inp1 - pxor %xmm2, %xmm1 - por %xmm1, %xmm0 - jmp .Loop_is_equal - -.Loop_is_equal_done: - pshufd \$0x4e, %xmm0, %xmm1 - por %xmm1, %xmm0 - movq %xmm0, %rax - inc $len # now it's 1 - test %rax, %rax - cmovnz $len, %eax - xor \$1, %eax - ret -.size vec_is_equal_16x,.-vec_is_equal_16x -___ -} -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/add_mod_384x384-x86_64.pl b/crypto/blst_src/asm/add_mod_384x384-x86_64.pl deleted file mode 100755 index 6ee3cf8760a..00000000000 --- a/crypto/blst_src/asm/add_mod_384x384-x86_64.pl +++ /dev/null @@ -1,260 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" - or die "can't call $xlate: $!"; - -# common argument layout -($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); -$b_ptr = "%rbx"; - -# common accumulator layout -@acc=map("%r$_",(8..15)); - -############################################################ 384x384 add/sub -# Double-width addition/subtraction modulo n<<384, as opposite to -# naively expected modulo n*n. It works because n<<384 is the actual -# input boundary condition for Montgomery reduction, not n*n. -# Just in case, this is duplicated, but only one module is -# supposed to be linked... 
-{ -my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr); # all registers are affected - # except for $n_ptr and $r_ptr -$code.=<<___; -.text - -.type __add_mod_384x384,\@abi-omnipotent -.align 32 -__add_mod_384x384: - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - mov 8*6($a_ptr), @acc[6] - - add 8*0($b_org), @acc[0] - mov 8*7($a_ptr), @acc[7] - adc 8*1($b_org), @acc[1] - mov 8*8($a_ptr), @acc[8] - adc 8*2($b_org), @acc[2] - mov 8*9($a_ptr), @acc[9] - adc 8*3($b_org), @acc[3] - mov 8*10($a_ptr), @acc[10] - adc 8*4($b_org), @acc[4] - mov 8*11($a_ptr), @acc[11] - adc 8*5($b_org), @acc[5] - mov @acc[0], 8*0($r_ptr) - adc 8*6($b_org), @acc[6] - mov @acc[1], 8*1($r_ptr) - adc 8*7($b_org), @acc[7] - mov @acc[2], 8*2($r_ptr) - adc 8*8($b_org), @acc[8] - mov @acc[4], 8*4($r_ptr) - mov @acc[6], @acc[0] - adc 8*9($b_org), @acc[9] - mov @acc[3], 8*3($r_ptr) - mov @acc[7], @acc[1] - adc 8*10($b_org), @acc[10] - mov @acc[5], 8*5($r_ptr) - mov @acc[8], @acc[2] - adc 8*11($b_org), @acc[11] - mov @acc[9], @acc[3] - sbb $b_org, $b_org - - sub 8*0($n_ptr), @acc[6] - sbb 8*1($n_ptr), @acc[7] - mov @acc[10], @acc[4] - sbb 8*2($n_ptr), @acc[8] - sbb 8*3($n_ptr), @acc[9] - sbb 8*4($n_ptr), @acc[10] - mov @acc[11], @acc[5] - sbb 8*5($n_ptr), @acc[11] - sbb \$0, $b_org - - cmovc @acc[0], @acc[6] - cmovc @acc[1], @acc[7] - cmovc @acc[2], @acc[8] - mov @acc[6], 8*6($r_ptr) - cmovc @acc[3], @acc[9] - mov @acc[7], 8*7($r_ptr) - cmovc @acc[4], @acc[10] - mov @acc[8], 8*8($r_ptr) - cmovc @acc[5], @acc[11] - mov @acc[9], 8*9($r_ptr) - mov @acc[10], 8*10($r_ptr) - mov @acc[11], 8*11($r_ptr) - - ret -.size __add_mod_384x384,.-__add_mod_384x384 - -.type __sub_mod_384x384,\@abi-omnipotent -.align 32 -__sub_mod_384x384: - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - mov 8*6($a_ptr), @acc[6] - - sub 8*0($b_org), @acc[0] - mov 8*7($a_ptr), @acc[7] - sbb 8*1($b_org), @acc[1] - mov 8*8($a_ptr), @acc[8] - sbb 8*2($b_org), @acc[2] - mov 8*9($a_ptr), @acc[9] - sbb 8*3($b_org), @acc[3] - mov 8*10($a_ptr), @acc[10] - sbb 8*4($b_org), @acc[4] - mov 8*11($a_ptr), @acc[11] - sbb 8*5($b_org), @acc[5] - mov @acc[0], 8*0($r_ptr) - sbb 8*6($b_org), @acc[6] - mov 8*0($n_ptr), @acc[0] - mov @acc[1], 8*1($r_ptr) - sbb 8*7($b_org), @acc[7] - mov 8*1($n_ptr), @acc[1] - mov @acc[2], 8*2($r_ptr) - sbb 8*8($b_org), @acc[8] - mov 8*2($n_ptr), @acc[2] - mov @acc[3], 8*3($r_ptr) - sbb 8*9($b_org), @acc[9] - mov 8*3($n_ptr), @acc[3] - mov @acc[4], 8*4($r_ptr) - sbb 8*10($b_org), @acc[10] - mov 8*4($n_ptr), @acc[4] - mov @acc[5], 8*5($r_ptr) - sbb 8*11($b_org), @acc[11] - mov 8*5($n_ptr), @acc[5] - sbb $b_org, $b_org - - and $b_org, @acc[0] - and $b_org, @acc[1] - and $b_org, @acc[2] - and $b_org, @acc[3] - and $b_org, @acc[4] - and $b_org, @acc[5] - - add @acc[0], @acc[6] - adc @acc[1], @acc[7] - mov @acc[6], 8*6($r_ptr) - adc @acc[2], @acc[8] - mov @acc[7], 8*7($r_ptr) - adc @acc[3], @acc[9] - mov @acc[8], 8*8($r_ptr) - adc @acc[4], @acc[10] - mov @acc[9], 8*9($r_ptr) - adc @acc[5], @acc[11] - mov @acc[10], 8*10($r_ptr) - mov @acc[11], 8*11($r_ptr) - - ret -.size __sub_mod_384x384,.-__sub_mod_384x384 - -.globl add_mod_384x384 -.hidden add_mod_384x384 -.type add_mod_384x384,\@function,4,"unwind" -.align 32 -add_mod_384x384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 
-.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - call __add_mod_384x384 - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size add_mod_384x384,.-add_mod_384x384 - -.globl sub_mod_384x384 -.hidden sub_mod_384x384 -.type sub_mod_384x384,\@function,4,"unwind" -.align 32 -sub_mod_384x384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - call __sub_mod_384x384 - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size sub_mod_384x384,.-sub_mod_384x384 -___ -} - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/arm-xlate.pl b/crypto/blst_src/asm/arm-xlate.pl deleted file mode 100755 index 35aab37407b..00000000000 --- a/crypto/blst_src/asm/arm-xlate.pl +++ /dev/null @@ -1,386 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# ARM assembler distiller/adapter by \@dot-asm. - -use strict; - -################################################################ -# Recognized "flavour"-s are: -# -# linux[32|64] GNU assembler, effectively pass-through -# ios[32|64] global symbols' decorations, PIC tweaks, etc. -# win[32|64] Visual Studio armasm-specific directives -# coff[32|64] e.g. clang --target=arm-windows ... -# -my $flavour = shift; - $flavour = "linux" if (!$flavour or $flavour eq "void"); - -my $output = shift; -open STDOUT,">$output" || die "can't open $output: $!"; - -my %GLOBALS; -my $dotinlocallabels = ($flavour !~ /ios/) ? 
1 : 0; -my $in_proc; # used with 'windows' flavour - -################################################################ -# directives which need special treatment on different platforms -################################################################ -my $arch = sub { } if ($flavour !~ /linux|coff64/);# omit .arch -my $fpu = sub { } if ($flavour !~ /linux/); # omit .fpu - -my $rodata = sub { - SWITCH: for ($flavour) { - /linux/ && return ".section\t.rodata"; - /ios/ && return ".section\t__TEXT,__const"; - /coff/ && return ".section\t.rdata,\"dr\""; - /win/ && return "\tAREA\t|.rdata|,DATA,READONLY,ALIGN=8"; - last; - } -}; - -my $hidden = sub { - if ($flavour =~ /ios/) { ".private_extern\t".join(',',@_); } -} if ($flavour !~ /linux/); - -my $comm = sub { - my @args = split(/,\s*/,shift); - my $name = @args[0]; - my $global = \$GLOBALS{$name}; - my $ret; - - if ($flavour =~ /ios32/) { - $ret = ".comm\t_$name,@args[1]\n"; - $ret .= ".non_lazy_symbol_pointer\n"; - $ret .= "$name:\n"; - $ret .= ".indirect_symbol\t_$name\n"; - $ret .= ".long\t0\n"; - $ret .= ".previous"; - $name = "_$name"; - } elsif ($flavour =~ /win/) { - $ret = "\tCOMMON\t|$name|,@args[1]"; - } elsif ($flavour =~ /coff/) { - $ret = ".comm\t$name,@args[1]"; - } else { - $ret = ".comm\t".join(',',@args); - } - - $$global = $name; - $ret; -}; - -my $globl = sub { - my $name = shift; - my $global = \$GLOBALS{$name}; - my $ret; - - SWITCH: for ($flavour) { - /ios/ && do { $name = "_$name"; last; }; - /win/ && do { $ret = ""; last; }; - } - - $ret = ".globl $name" if (!defined($ret)); - $$global = $name; - $ret; -}; -my $global = $globl; - -my $extern = sub { - &$globl(@_); - if ($flavour =~ /win/) { - return "\tEXTERN\t@_"; - } - return; # return nothing -}; - -my $type = sub { - my $arg = join(',',@_); - my $ret; - - SWITCH: for ($flavour) { - /ios32/ && do { if ($arg =~ /(\w+),\s*%function/) { - $ret = "#ifdef __thumb2__\n" . - ".thumb_func $1\n" . - "#endif"; - } - last; - }; - /win/ && do { if ($arg =~ /(\w+),\s*%(function|object)/) { - my $type = "[DATA]"; - if ($2 eq "function") { - $in_proc = $1; - $type = "[FUNC]"; - } - $ret = $GLOBALS{$1} ? "\tEXPORT\t|$1|$type" - : ""; - } - last; - }; - /coff/ && do { if ($arg =~ /(\w+),\s*%function/) { - $ret = ".def $1;\n". - ".type 32;\n". - ".endef"; - } - last; - }; - } - return $ret; -} if ($flavour !~ /linux/); - -my $size = sub { - if ($in_proc && $flavour =~ /win/) { - $in_proc = undef; - return "\tENDP"; - } -} if ($flavour !~ /linux/); - -my $inst = sub { - if ($flavour =~ /win/) { "\tDCDU\t".join(',',@_); } - else { ".long\t".join(',',@_); } -} if ($flavour !~ /linux/); - -my $asciz = sub { - my $line = join(",",@_); - if ($line =~ /^"(.*)"$/) - { if ($flavour =~ /win/) { - "\tDCB\t$line,0\n\tALIGN\t4"; - } else { - ".byte " . join(",",unpack("C*",$1),0) . 
"\n.align 2"; - } - } else { ""; } -}; - -my $align = sub { - "\tALIGN\t".2**@_[0]; -} if ($flavour =~ /win/); - $align = sub { - ".p2align\t".@_[0]; -} if ($flavour =~ /coff/); - -my $byte = sub { - "\tDCB\t".join(',',@_); -} if ($flavour =~ /win/); - -my $short = sub { - "\tDCWU\t".join(',',@_); -} if ($flavour =~ /win/); - -my $word = sub { - "\tDCDU\t".join(',',@_); -} if ($flavour =~ /win/); - -my $long = $word if ($flavour =~ /win/); - -my $quad = sub { - "\tDCQU\t".join(',',@_); -} if ($flavour =~ /win/); - -my $skip = sub { - "\tSPACE\t".shift; -} if ($flavour =~ /win/); - -my $code = sub { - "\tCODE@_[0]"; -} if ($flavour =~ /win/); - -my $thumb = sub { # .thumb should appear prior .text in source - "# define ARM THUMB\n" . - "\tTHUMB"; -} if ($flavour =~ /win/); - -my $text = sub { - "\tAREA\t|.text|,CODE,ALIGN=8,".($flavour =~ /64/ ? "ARM64" : "ARM"); -} if ($flavour =~ /win/); - -my $syntax = sub {} if ($flavour =~ /win/); # omit .syntax - -my $rva = sub { - # .rva directive comes in handy only on 32-bit Windows, i.e. it can - # be used only in '#if defined(_WIN32) && !defined(_WIN64)' sections. - # However! Corresponding compilers don't seem to bet on PIC, which - # raises the question why would assembler programmer have to jump - # through the hoops? But just in case, it would go as following: - # - # ldr r1,.LOPENSSL_armcap - # ldr r2,.LOPENSSL_armcap+4 - # adr r0,.LOPENSSL_armcap - # bic r1,r1,#1 ; de-thumb-ify link.exe's ideas - # sub r0,r0,r1 ; r0 is image base now - # ldr r0,[r0,r2] - # ... - #.LOPENSSL_armcap: - # .rva .LOPENSSL_armcap ; self-reference - # .rva OPENSSL_armcap_P ; real target - # - # Non-position-independent [and ISA-neutral] alternative is so much - # simpler: - # - # ldr r0,.LOPENSSL_armcap - # ldr r0,[r0] - # ... - #.LOPENSSL_armcap: - # .long OPENSSL_armcap_P - # - "\tDCDU\t@_[0]\n\tRELOC\t2" -} if ($flavour =~ /win(?!64)/); - -################################################################ -# some broken instructions in Visual Studio armasm[64]... - -my $it = sub {} if ($flavour =~ /win32/); # omit 'it' - -my $ext = sub { - "\text8\t".join(',',@_); -} if ($flavour =~ /win64/); - -my $csel = sub { - my ($args,$comment) = split(m|\s*//|,shift); - my @regs = split(m|,\s*|,$args); - my $cond = pop(@regs); - - "\tcsel$cond\t".join(',',@regs); -} if ($flavour =~ /win64/); - -my $csetm = sub { - my ($args,$comment) = split(m|\s*//|,shift); - my @regs = split(m|,\s*|,$args); - my $cond = pop(@regs); - - "\tcsetm$cond\t".join(',',@regs); -} if ($flavour =~ /win64/); - -# ... then conditional branch instructions are also broken, but -# maintaining all the variants is tedious, so I kludge-fix it -# elsewhere... -################################################################ -my $adrp = sub { - my ($args,$comment) = split(m|\s*//|,shift); - "\tadrp\t$args\@PAGE"; -} if ($flavour =~ /ios64/); - -my $paciasp = sub { - ($flavour =~ /linux/) ? "\t.inst\t0xd503233f" - : &$inst(0xd503233f); -}; - -my $autiasp = sub { - ($flavour =~ /linux/) ? 
"\t.inst\t0xd50323bf" - : &$inst(0xd50323bf); -}; - -sub range { - my ($r,$sfx,$start,$end) = @_; - - join(",",map("$r$_$sfx",($start..$end))); -} - -sub expand_line { - my $line = shift; - my @ret = (); - - pos($line)=0; - - while ($line =~ m/\G[^@\/\{\"]*/g) { - if ($line =~ m/\G(@|\/\/|$)/gc) { - last; - } - elsif ($line =~ m/\G\{/gc) { - my $saved_pos = pos($line); - $line =~ s/\G([rdqv])([0-9]+)([^\-]*)\-\1([0-9]+)\3/range($1,$3,$2,$4)/e; - pos($line) = $saved_pos; - $line =~ m/\G[^\}]*\}/g; - } - elsif ($line =~ m/\G\"/gc) { - $line =~ m/\G[^\"]*\"/g; - } - } - - $line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge; - - if ($flavour =~ /win/) { - # adjust alignment hints, "[rN,:32]" -> "[rN@32]" - $line =~ s/(\[\s*(?:r[0-9]+|sp))\s*,?\s*:([0-9]+\s*\])/$1\@$2/; - # adjust local labels, ".Lwhatever" -> "|$Lwhatever|" - $line =~ s/\.(L\w{2,})/|\$$1|/g; - # omit "#:lo12:" on win64 - $line =~ s/#:lo12://; - } elsif ($flavour =~ /coff(?!64)/) { - $line =~ s/\.L(\w{2,})/(\$ML$1)/g; - } elsif ($flavour =~ /ios64/) { - $line =~ s/#:lo12:(\w+)/$1\@PAGEOFF/; - } - - if ($flavour =~ /64/) { - # "vX.Md[N]" -> "vX.d[N] - $line =~ s/\b(v[0-9]+)\.[1-9]+([bhsd]\[[0-9]+\])/$1.$2/; - } - - return $line; -} - -while(my $line=<>) { - - # fix up assembler-specific commentary delimiter - $line =~ s/@(?=[\s@])/\;/g if ($flavour =~ /win|coff/); - - if ($line =~ m/^\s*(#|@|;|\/\/)/) { print $line; next; } - - $line =~ s|/\*.*\*/||; # get rid of C-style comments... - $line =~ s|^\s+||; # ... and skip white spaces in beginning... - $line =~ s|\s+$||; # ... and at the end - - { - $line =~ s|[\b\.]L(\w{2,})|L$1|g; # common denominator for Locallabel - $line =~ s|\bL(\w{2,})|\.L$1|g if ($dotinlocallabels); - } - - { - $line =~ s|(^[\.\w]+)\:\s*||; - my $label = $1; - if ($label) { - $label = ($GLOBALS{$label} or $label); - if ($flavour =~ /win/) { - $label =~ s|^\.L(?=\w)|\$L|; - printf "|%s|%s", $label, ($label eq $in_proc ? " PROC" : ""); - } else { - $label =~ s|^\.L(?=\w)|\$ML| if ($flavour =~ /coff(?!64)/); - printf "%s:", $label; - } - } - } - - if ($line !~ m/^[#@;]/) { - $line =~ s|^\s*(\.?)(\S+)\s*||; - my $c = $1; $c = "\t" if ($c eq ""); - my $mnemonic = $2; - my $opcode; - if ($mnemonic =~ m/([^\.]+)\.([^\.]+)/) { - $opcode = eval("\$$1_$2"); - } else { - $opcode = eval("\$$mnemonic"); - } - - my $arg=expand_line($line); - - if (ref($opcode) eq 'CODE') { - $line = &$opcode($arg); - } elsif ($mnemonic) { - if ($flavour =~ /win64/) { - # "b.cond" -> "bcond", kludge-fix:-( - $mnemonic =~ s/^b\.([a-z]{2}$)/b$1/; - } - $line = $c.$mnemonic; - $line.= "\t$arg" if ($arg ne ""); - } - } - - print $line if ($line); - print "\n"; -} - -print "\tEND\n" if ($flavour =~ /win/); - -close STDOUT; diff --git a/crypto/blst_src/asm/ct_inverse_mod_256-armv8.pl b/crypto/blst_src/asm/ct_inverse_mod_256-armv8.pl deleted file mode 100755 index ced8c6c37e9..00000000000 --- a/crypto/blst_src/asm/ct_inverse_mod_256-armv8.pl +++ /dev/null @@ -1,586 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Both constant-time and fast Euclidean inversion as suggested in -# https://eprint.iacr.org/2020/972. ~4.600 cycles on Apple M1, ~8.900 - -# on Cortex-A57. 
-# -# void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod, -# const vec256 modx); -# -$python_ref.=<<'___'; -def ct_inverse_mod_256(inp, mod): - a, u = inp, 1 - b, v = mod, 0 - - k = 31 - mask = (1 << k) - 1 - - for i in range(0, 512 // k - 1): - # __ab_approximation_31 - n = max(a.bit_length(), b.bit_length()) - if n < 64: - a_, b_ = a, b - else: - a_ = (a & mask) | ((a >> (n-k-2)) << k) - b_ = (b & mask) | ((b >> (n-k-2)) << k) - - # __inner_loop_31 - f0, g0, f1, g1 = 1, 0, 0, 1 - for j in range(0, k): - if a_ & 1: - if a_ < b_: - a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 - a_, f0, g0 = a_-b_, f0-f1, g0-g1 - a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 - - # __smul_256_n_shift_by_31 - a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k - if a < 0: - a, f0, g0 = -a, -f0, -g0 - if b < 0: - b, f1, g1 = -b, -f1, -g1 - - # __smul_512x63 - u, v = u*f0 + v*g0, u*f1 + v*g1 - - if 512 % k + k: - f0, g0, f1, g1 = 1, 0, 0, 1 - for j in range(0, 512 % k + k): - if a & 1: - if a < b: - a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 - a, f0, g0 = a-b, f0-f1, g0-g1 - a, f1, g1 = a >> 1, f1 << 1, g1 << 1 - - v = u*f1 + v*g1 - - mod <<= 512 - mod.bit_length() # align to the left - if v < 0: - v += mod - if v < 0: - v += mod - elif v == 1<<512 - v -= mod - - return v & (2**512 - 1) # to be reduced % mod -___ - -$flavour = shift; -$output = shift; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3)); -my @acc=map("x$_",(4..11)); -my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(12..17)); -my $cnt = $n_ptr; -my @t = map("x$_",(19..26)); -my ($a_lo, $b_lo) = @acc[3,7]; - -$frame = 16+2*512; - -$code.=<<___; -.text - -.globl ct_inverse_mod_256 -.type ct_inverse_mod_256, %function -.align 5 -ct_inverse_mod_256: - paciasp - stp x29, x30, [sp,#-80]! - add x29, sp, #0 - stp x19, x20, [sp,#16] - stp x21, x22, [sp,#32] - stp x23, x24, [sp,#48] - stp x25, x26, [sp,#64] - sub sp, sp, #$frame - - ldp @acc[0], @acc[1], [$in_ptr,#8*0] - ldp @acc[2], @acc[3], [$in_ptr,#8*2] - - add $in_ptr, sp, #16+511 // find closest 512-byte-aligned spot - and $in_ptr, $in_ptr, #-512 // in the frame... 
- str $out_ptr, [sp] - - ldp @acc[4], @acc[5], [$n_ptr,#8*0] - ldp @acc[6], @acc[7], [$n_ptr,#8*2] - - stp @acc[0], @acc[1], [$in_ptr,#8*0] // copy input to |a| - stp @acc[2], @acc[3], [$in_ptr,#8*2] - stp @acc[4], @acc[5], [$in_ptr,#8*4] // copy modulus to |b| - stp @acc[6], @acc[7], [$in_ptr,#8*6] - - ////////////////////////////////////////// first iteration - bl .Lab_approximation_31_256_loaded - - eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - str $f0,[$out_ptr,#8*8] // initialize |u| with |f0| - - mov $f0, $f1 // |f1| - mov $g0, $g1 // |g1| - add $out_ptr, $out_ptr, #8*4 // pointer to dst |b| - bl __smul_256_n_shift_by_31 - str $f0, [$out_ptr,#8*9] // initialize |v| with |f1| - - ////////////////////////////////////////// second iteration - eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov $f_, $f0 // corrected |f0| - mov $g_, $g0 // corrected |g0| - - mov $f0, $f1 // |f1| - mov $g0, $g1 // |g1| - add $out_ptr, $out_ptr, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - ldr @acc[4], [$in_ptr,#8*8] // |u| - ldr @acc[5], [$in_ptr,#8*13] // |v| - madd @acc[0], $f_, @acc[4], xzr // |u|*|f0| - madd @acc[0], $g_, @acc[5], @acc[0] // |v|*|g0| - str @acc[0], [$out_ptr,#8*4] - asr @acc[1], @acc[0], #63 // sign extenstion - stp @acc[1], @acc[1], [$out_ptr,#8*5] - stp @acc[1], @acc[1], [$out_ptr,#8*7] - - madd @acc[0], $f0, @acc[4], xzr // |u|*|f1| - madd @acc[0], $g0, @acc[5], @acc[0] // |v|*|g1| - str @acc[0], [$out_ptr,#8*9] - asr @acc[1], @acc[0], #63 // sign extenstion - stp @acc[1], @acc[1], [$out_ptr,#8*10] - stp @acc[1], @acc[1], [$out_ptr,#8*12] -___ -for($i=2; $i<15; $i++) { -$code.=<<___; - eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov $f_, $f0 // corrected |f0| - mov $g_, $g0 // corrected |g0| - - mov $f0, $f1 // |f1| - mov $g0, $g1 // |g1| - add $out_ptr, $out_ptr, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add $out_ptr, $out_ptr, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc @t[3], @t[3], @t[4] - str @t[3], [$out_ptr,#8*4] - - mov $f_, $f0 // corrected |f1| - mov $g_, $g0 // corrected |g1| - add $out_ptr, $out_ptr, #8*5 // pointer to destination |v| - bl __smul_256x63 -___ -$code.=<<___ if ($i>7); - bl __smul_512x63_tail -___ -$code.=<<___ if ($i<=7); - adc @t[3], @t[3], @t[4] - stp @t[3], @t[3], [$out_ptr,#8*4] - stp @t[3], @t[3], [$out_ptr,#8*6] -___ -} -$code.=<<___; - ////////////////////////////////////////// two[!] 
last iterations - eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| - mov $cnt, #47 // 31 + 512 % 31 - //bl __ab_approximation_62_256 // |a| and |b| are exact, - ldr $a_lo, [$in_ptr,#8*0] // just load - ldr $b_lo, [$in_ptr,#8*4] - bl __inner_loop_62_256 - - mov $f_, $f1 - mov $g_, $g1 - ldr $out_ptr, [sp] // original out_ptr - bl __smul_256x63 - bl __smul_512x63_tail - ldr x30, [x29,#8] - - smulh @t[1], @acc[3], $g_ // figure out top-most limb - ldp @acc[4], @acc[5], [$nx_ptr,#8*0] - adc @t[4], @t[4], @t[6] - ldp @acc[6], @acc[7], [$nx_ptr,#8*2] - - add @t[1], @t[1], @t[4] // @t[1] is 1, 0 or -1 - asr @t[0], @t[1], #63 // sign as mask - - and @t[4], @acc[4], @t[0] // add mod<<256 conditionally - and @t[5], @acc[5], @t[0] - adds @acc[0], @acc[0], @t[4] - and @t[6], @acc[6], @t[0] - adcs @acc[1], @acc[1], @t[5] - and @t[7], @acc[7], @t[0] - adcs @acc[2], @acc[2], @t[6] - adcs @acc[3], @t[3], @t[7] - adc @t[1], @t[1], xzr // @t[1] is 1, 0 or -1 - - neg @t[0], @t[1] - orr @t[1], @t[1], @t[0] // excess bit or sign as mask - asr @t[0], @t[0], #63 // excess bit as mask - - and @acc[4], @acc[4], @t[1] // mask |mod| - and @acc[5], @acc[5], @t[1] - and @acc[6], @acc[6], @t[1] - and @acc[7], @acc[7], @t[1] - - eor @acc[4], @acc[4], @t[0] // conditionally negate |mod| - eor @acc[5], @acc[5], @t[0] - adds @acc[4], @acc[4], @t[0], lsr#63 - eor @acc[6], @acc[6], @t[0] - adcs @acc[5], @acc[5], xzr - eor @acc[7], @acc[7], @t[0] - adcs @acc[6], @acc[6], xzr - adc @acc[7], @acc[7], xzr - - adds @acc[0], @acc[0], @acc[4] // final adjustment for |mod|<<256 - adcs @acc[1], @acc[1], @acc[5] - adcs @acc[2], @acc[2], @acc[6] - stp @acc[0], @acc[1], [$out_ptr,#8*4] - adc @acc[3], @acc[3], @acc[7] - stp @acc[2], @acc[3], [$out_ptr,#8*6] - - add sp, sp, #$frame - ldp x19, x20, [x29,#16] - ldp x21, x22, [x29,#32] - ldp x23, x24, [x29,#48] - ldp x25, x26, [x29,#64] - ldr x29, [sp],#80 - autiasp - ret -.size ct_inverse_mod_256,.-ct_inverse_mod_256 - -//////////////////////////////////////////////////////////////////////// -.type __smul_256x63, %function -.align 5 -__smul_256x63: -___ -for($j=0; $j<2; $j++) { -my $f_ = $f_; $f_ = $g_ if ($j); -my @acc = @acc; @acc = @acc[4..7] if ($j); -my $k = 8*8+8*5*$j; -$code.=<<___; - ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |u| (or |v|) - asr $f1, $f_, #63 // |f_|'s sign as mask (or |g_|'s) - ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] - eor $f_, $f_, $f1 // conditionally negate |f_| (or |g_|) - ldr @t[3+$j], [$in_ptr,#8*4+$k] - - eor @acc[0], @acc[0], $f1 // conditionally negate |u| (or |v|) - sub $f_, $f_, $f1 - eor @acc[1], @acc[1], $f1 - adds @acc[0], @acc[0], $f1, lsr#63 - eor @acc[2], @acc[2], $f1 - adcs @acc[1], @acc[1], xzr - eor @acc[3], @acc[3], $f1 - adcs @acc[2], @acc[2], xzr - eor @t[3+$j], @t[3+$j], $f1 - umulh @t[0], @acc[0], $f_ - adcs @acc[3], @acc[3], xzr - umulh @t[1], @acc[1], $f_ - adcs @t[3+$j], @t[3+$j], xzr - umulh @t[2], @acc[2], $f_ -___ -$code.=<<___ if ($j!=0); - adc $g1, xzr, xzr // used in __smul_512x63_tail -___ -$code.=<<___; - mul @acc[0], @acc[0], $f_ - cmp $f_, #0 - mul @acc[1], @acc[1], $f_ - csel @t[3+$j], @t[3+$j], xzr, ne - mul @acc[2], @acc[2], $f_ - adds @acc[1], @acc[1], @t[0] - mul @t[5+$j], @acc[3], $f_ - adcs @acc[2], @acc[2], @t[1] - adcs @t[5+$j], @t[5+$j], @t[2] -___ -$code.=<<___ if ($j==0); - adc @t[7], xzr, xzr -___ -} -$code.=<<___; - adc @t[7], @t[7], xzr - - adds @acc[0], @acc[0], @acc[4] - adcs @acc[1], @acc[1], @acc[5] - adcs @acc[2], @acc[2], @acc[6] - stp @acc[0], @acc[1], [$out_ptr,#8*0] - adcs @t[5], @t[5], @t[6] - 
stp @acc[2], @t[5], [$out_ptr,#8*2] - - ret -.size __smul_256x63,.-__smul_256x63 - -.type __smul_512x63_tail, %function -.align 5 -__smul_512x63_tail: - umulh @t[5], @acc[3], $f_ - ldp @acc[1], @acc[2], [$in_ptr,#8*18] // load rest of |v| - adc @t[7], @t[7], xzr - ldr @acc[3], [$in_ptr,#8*20] - and @t[3], @t[3], $f_ - - umulh @acc[7], @acc[7], $g_ // resume |v|*|g1| chain - - sub @t[5], @t[5], @t[3] // tie up |u|*|f1| chain - asr @t[6], @t[5], #63 - - eor @acc[1], @acc[1], $f1 // conditionally negate rest of |v| - eor @acc[2], @acc[2], $f1 - adds @acc[1], @acc[1], $g1 - eor @acc[3], @acc[3], $f1 - adcs @acc[2], @acc[2], xzr - umulh @t[0], @t[4], $g_ - adc @acc[3], @acc[3], xzr - umulh @t[1], @acc[1], $g_ - add @acc[7], @acc[7], @t[7] - umulh @t[2], @acc[2], $g_ - - mul @acc[0], @t[4], $g_ - mul @acc[1], @acc[1], $g_ - adds @acc[0], @acc[0], @acc[7] - mul @acc[2], @acc[2], $g_ - adcs @acc[1], @acc[1], @t[0] - mul @t[3], @acc[3], $g_ - adcs @acc[2], @acc[2], @t[1] - adcs @t[3], @t[3], @t[2] - adc @t[4], xzr, xzr // used in the final step - - adds @acc[0], @acc[0], @t[5] - adcs @acc[1], @acc[1], @t[6] - adcs @acc[2], @acc[2], @t[6] - stp @acc[0], @acc[1], [$out_ptr,#8*4] - adcs @t[3], @t[3], @t[6] // carry is used in the final step - stp @acc[2], @t[3], [$out_ptr,#8*6] - - ret -.size __smul_512x63_tail,.-__smul_512x63_tail - -.type __smul_256_n_shift_by_31, %function -.align 5 -__smul_256_n_shift_by_31: -___ -for($j=0; $j<2; $j++) { -my $f0 = $f0; $f0 = $g0 if ($j); -my @acc = @acc; @acc = @acc[4..7] if ($j); -my $k = 8*4*$j; -$code.=<<___; - ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |a| (or |b|) - asr @t[5], $f0, #63 // |f0|'s sign as mask (or |g0|'s) - ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] - eor @t[6], $f0, @t[5] // conditionally negate |f0| (or |g0|) - - eor @acc[0], @acc[0], @t[5] // conditionally negate |a| (or |b|) - sub @t[6], @t[6], @t[5] - eor @acc[1], @acc[1], @t[5] - adds @acc[0], @acc[0], @t[5], lsr#63 - eor @acc[2], @acc[2], @t[5] - adcs @acc[1], @acc[1], xzr - eor @acc[3], @acc[3], @t[5] - umulh @t[0], @acc[0], @t[6] - adcs @acc[2], @acc[2], xzr - umulh @t[1], @acc[1], @t[6] - adc @acc[3], @acc[3], xzr - umulh @t[2], @acc[2], @t[6] - and @t[5], @t[5], @t[6] - umulh @t[3+$j], @acc[3], @t[6] - neg @t[5], @t[5] - - mul @acc[0], @acc[0], @t[6] - mul @acc[1], @acc[1], @t[6] - mul @acc[2], @acc[2], @t[6] - adds @acc[1], @acc[1], @t[0] - mul @acc[3], @acc[3], @t[6] - adcs @acc[2], @acc[2], @t[1] - adcs @acc[3], @acc[3], @t[2] - adc @t[3+$j], @t[3+$j], @t[5] -___ -} -$code.=<<___; - adds @acc[0], @acc[0], @acc[4] - adcs @acc[1], @acc[1], @acc[5] - adcs @acc[2], @acc[2], @acc[6] - adcs @acc[3], @acc[3], @acc[7] - adc @acc[4], @t[3], @t[4] - - extr @acc[0], @acc[1], @acc[0], #31 - extr @acc[1], @acc[2], @acc[1], #31 - extr @acc[2], @acc[3], @acc[2], #31 - asr @t[4], @acc[4], #63 // result's sign as mask - extr @acc[3], @acc[4], @acc[3], #31 - - eor @acc[0], @acc[0], @t[4] // ensure the result is positive - eor @acc[1], @acc[1], @t[4] - adds @acc[0], @acc[0], @t[4], lsr#63 - eor @acc[2], @acc[2], @t[4] - adcs @acc[1], @acc[1], xzr - eor @acc[3], @acc[3], @t[4] - adcs @acc[2], @acc[2], xzr - stp @acc[0], @acc[1], [$out_ptr,#8*0] - adc @acc[3], @acc[3], xzr - stp @acc[2], @acc[3], [$out_ptr,#8*2] - - eor $f0, $f0, @t[4] // adjust |f/g| accordingly - eor $g0, $g0, @t[4] - sub $f0, $f0, @t[4] - sub $g0, $g0, @t[4] - - ret -.size __smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31 -___ - -{ -my @a = @acc[0..3]; -my @b = @acc[4..7]; -my ($fg0, $fg1, $bias) = ($g0, $g1, @t[4]); - 
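For reference, the approximation step that the __ab_approximation_31_256 routine below implements can be sketched in plain Python, mirroring the a_/b_ construction in the reference pseudocode near the top of this file; the helper name is illustrative only and not part of blst:

    def ab_approximation_31(a, b, k=31):
        # Sketch of __ab_approximation_31_256: keep the low k bits of |a|
        # and |b| exactly, and graft the top bits of whichever operand is
        # longer on top, so the inner loop can work on one-limb values.
        n = max(a.bit_length(), b.bit_length())
        if n < 64:
            return a, b
        mask = (1 << k) - 1
        a_ = (a & mask) | ((a >> (n - k - 2)) << k)
        b_ = (b & mask) | ((b >> (n - k - 2)) << k)
        return a_, b_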
-$code.=<<___; -.type __ab_approximation_31_256, %function -.align 4 -__ab_approximation_31_256: - ldp @a[2], @a[3], [$in_ptr,#8*2] - ldp @b[2], @b[3], [$in_ptr,#8*6] - ldp @a[0], @a[1], [$in_ptr,#8*0] - ldp @b[0], @b[1], [$in_ptr,#8*4] - -.Lab_approximation_31_256_loaded: - orr @t[0], @a[3], @b[3] // check top-most limbs, ... - cmp @t[0], #0 - csel @a[3], @a[3], @a[2], ne - csel @b[3], @b[3], @b[2], ne - csel @a[2], @a[2], @a[1], ne - orr @t[0], @a[3], @b[3] // and ones before top-most, ... - csel @b[2], @b[2], @b[1], ne - - cmp @t[0], #0 - csel @a[3], @a[3], @a[2], ne - csel @b[3], @b[3], @b[2], ne - csel @a[2], @a[2], @a[0], ne - orr @t[0], @a[3], @b[3] // and one more, ... - csel @b[2], @b[2], @b[0], ne - - clz @t[0], @t[0] - cmp @t[0], #64 - csel @t[0], @t[0], xzr, ne - csel @a[3], @a[3], @a[2], ne - csel @b[3], @b[3], @b[2], ne - neg @t[1], @t[0] - - lslv @a[3], @a[3], @t[0] // align high limbs to the left - lslv @b[3], @b[3], @t[0] - lsrv @a[2], @a[2], @t[1] - lsrv @b[2], @b[2], @t[1] - and @a[2], @a[2], @t[1], asr#6 - and @b[2], @b[2], @t[1], asr#6 - orr $a_lo, @a[3], @a[2] - orr $b_lo, @b[3], @b[2] - - bfxil $a_lo, @a[0], #0, #31 - bfxil $b_lo, @b[0], #0, #31 - - b __inner_loop_31_256 - ret -.size __ab_approximation_31_256,.-__ab_approximation_31_256 - -.type __inner_loop_31_256, %function -.align 4 -__inner_loop_31_256: - mov $cnt, #31 - mov $fg0, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 - mov $fg1, #0x800000007FFFFFFF // |f1|=0, |g1|=1 - mov $bias,#0x7FFFFFFF7FFFFFFF - -.Loop_31_256: - sbfx @t[3], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting - sub $cnt, $cnt, #1 - and @t[0], $b_lo, @t[3] - sub @t[1], $b_lo, $a_lo // |b_|-|a_| - subs @t[2], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) - mov @t[0], $fg1 - csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_| - csel $a_lo, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - csel $fg1, $fg1, $fg0, hs // exchange |fg0| and |fg1| - csel $fg0, $fg0, @t[0], hs - lsr $a_lo, $a_lo, #1 - and @t[0], $fg1, @t[3] - and @t[1], $bias, @t[3] - sub $fg0, $fg0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) - add $fg1, $fg1, $fg1 // |f1|<<=1 - add $fg0, $fg0, @t[1] - sub $fg1, $fg1, $bias - cbnz $cnt, .Loop_31_256 - - mov $bias, #0x7FFFFFFF - ubfx $f0, $fg0, #0, #32 - ubfx $g0, $fg0, #32, #32 - ubfx $f1, $fg1, #0, #32 - ubfx $g1, $fg1, #32, #32 - sub $f0, $f0, $bias // remove bias - sub $g0, $g0, $bias - sub $f1, $f1, $bias - sub $g1, $g1, $bias - - ret -.size __inner_loop_31_256,.-__inner_loop_31_256 - -.type __inner_loop_62_256, %function -.align 4 -__inner_loop_62_256: - mov $f0, #1 // |f0|=1 - mov $g0, #0 // |g0|=0 - mov $f1, #0 // |f1|=0 - mov $g1, #1 // |g1|=1 - -.Loop_62_256: - sbfx @t[3], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting - sub $cnt, $cnt, #1 - and @t[0], $b_lo, @t[3] - sub @t[1], $b_lo, $a_lo // |b_|-|a_| - subs @t[2], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) - mov @t[0], $f0 - csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_| - csel $a_lo, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - mov @t[1], $g0 - csel $f0, $f0, $f1, hs // exchange |f0| and |f1| - csel $f1, $f1, @t[0], hs - csel $g0, $g0, $g1, hs // exchange |g0| and |g1| - csel $g1, $g1, @t[1], hs - lsr $a_lo, $a_lo, #1 - and @t[0], $f1, @t[3] - and @t[1], $g1, @t[3] - add $f1, $f1, $f1 // |f1|<<=1 - add $g1, $g1, $g1 // |g1|<<=1 - sub $f0, $f0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) - sub $g0, $g0, @t[1] // |g0|-=|g1| (or |g0-=0| ...) 
- cbnz $cnt, .Loop_62_256 - - ret -.size __inner_loop_62_256,.-__inner_loop_62_256 -___ -} - -foreach(split("\n",$code)) { - s/\b(smaddl\s+x[0-9]+,\s)x([0-9]+,\s+)x([0-9]+)/$1w$2w$3/; - print $_,"\n"; -} -close STDOUT; diff --git a/crypto/blst_src/asm/ct_inverse_mod_256-x86_64.pl b/crypto/blst_src/asm/ct_inverse_mod_256-x86_64.pl deleted file mode 100755 index 24ab5452930..00000000000 --- a/crypto/blst_src/asm/ct_inverse_mod_256-x86_64.pl +++ /dev/null @@ -1,837 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Both constant-time and fast Euclidean inversion as suggested in -# https://eprint.iacr.org/2020/972. ~5.300 cycles on Coffee Lake. -# -# void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod, -# const vec256 modx); -# -$python_ref.=<<'___'; -def ct_inverse_mod_256(inp, mod): - a, u = inp, 1 - b, v = mod, 0 - - k = 31 - mask = (1 << k) - 1 - - for i in range(0, 512 // k - 1): - # __ab_approximation_31 - n = max(a.bit_length(), b.bit_length()) - if n < 64: - a_, b_ = a, b - else: - a_ = (a & mask) | ((a >> (n-k-2)) << k) - b_ = (b & mask) | ((b >> (n-k-2)) << k) - - # __inner_loop_31 - f0, g0, f1, g1 = 1, 0, 0, 1 - for j in range(0, k): - if a_ & 1: - if a_ < b_: - a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 - a_, f0, g0 = a_-b_, f0-f1, g0-g1 - a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 - - # __smulq_256_n_shift_by_31 - a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k - if a < 0: - a, f0, g0 = -a, -f0, -g0 - if b < 0: - b, f1, g1 = -b, -f1, -g1 - - # __smulq_512x63 - u, v = u*f0 + v*g0, u*f1 + v*g1 - - if 512 % k + k: - f0, g0, f1, g1 = 1, 0, 0, 1 - for j in range(0, 512 % k + k): - if a & 1: - if a < b: - a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 - a, f0, g0 = a-b, f0-f1, g0-g1 - a, f1, g1 = a >> 1, f1 << 1, g1 << 1 - - v = u*f1 + v*g1 - - mod <<= 512 - mod.bit_length() # align to the left - if v < 0: - v += mod - if v < 0: - v += mod - elif v == 1<<512 - v -= mod - - return v & (2**512 - 1) # to be reduced % mod -___ - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" - or die "can't call $xlate: $!"; - -my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx"); -my @acc = map("%r$_",(8..15)); -my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13"); -my $cnt = "%edx"; - -$frame = 8*6+2*512; - -$code.=<<___; -.text - -.globl ct_inverse_mod_256 -.type ct_inverse_mod_256,\@function,4,"unwind" -.align 32 -ct_inverse_mod_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$$frame, %rsp -.cfi_adjust_cfa_offset $frame -.cfi_end_prologue - - lea 8*6+511(%rsp), %rax # find closest 512-byte-aligned spot - and \$-512, %rax # in the frame... 
- mov $out_ptr, 8*4(%rsp) - mov $nx_ptr, 8*5(%rsp) - - mov 8*0($in_ptr), @acc[0] # load input - mov 8*1($in_ptr), @acc[1] - mov 8*2($in_ptr), @acc[2] - mov 8*3($in_ptr), @acc[3] - - mov 8*0($n_ptr), @acc[4] # load modulus - mov 8*1($n_ptr), @acc[5] - mov 8*2($n_ptr), @acc[6] - mov 8*3($n_ptr), @acc[7] - - mov @acc[0], 8*0(%rax) # copy input to |a| - mov @acc[1], 8*1(%rax) - mov @acc[2], 8*2(%rax) - mov @acc[3], 8*3(%rax) - - mov @acc[4], 8*4(%rax) # copy modulus to |b| - mov @acc[5], 8*5(%rax) - mov @acc[6], 8*6(%rax) - mov @acc[7], 8*7(%rax) - mov %rax, $in_ptr - - ################################# first iteration - mov \$31, $cnt - call __ab_approximation_31_256 - #mov $f0, 8*0(%rsp) - #mov $g0, 8*1(%rsp) - mov $f1, 8*2(%rsp) - mov $g1, 8*3(%rsp) - - mov \$256, $out_ptr - xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| - call __smulq_256_n_shift_by_31 - #mov $f0, 8*0(%rsp) # corrected |f0| - #mov $g0, 8*1(%rsp) # corrected |g0| - mov $f0, 8*8($out_ptr) # initialize |u| with |f0| - - mov 8*2(%rsp), $f0 # |f1| - mov 8*3(%rsp), $g0 # |g1| - lea 8*4($out_ptr), $out_ptr # pointer to destination |b| - call __smulq_256_n_shift_by_31 - #mov $f0, 8*2(%rsp) # corrected |f1| - #mov $g0, 8*3(%rsp) # corrected |g1| - mov $f0, 8*9($out_ptr) # initialize |v| with |f1| - - ################################# second iteration - xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v| - mov \$31, $cnt - call __ab_approximation_31_256 - #mov $f0, 8*0(%rsp) - #mov $g0, 8*1(%rsp) - mov $f1, 8*2(%rsp) - mov $g1, 8*3(%rsp) - - mov \$256, $out_ptr - xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| - call __smulq_256_n_shift_by_31 - mov $f0, 8*0(%rsp) # corrected |f0| - mov $g0, 8*1(%rsp) # corrected |g0| - - mov 8*2(%rsp), $f0 # |f1| - mov 8*3(%rsp), $g0 # |g1| - lea 8*4($out_ptr), $out_ptr # pointer to destination |b| - call __smulq_256_n_shift_by_31 - #mov $f0, 8*2(%rsp) # corrected |f1| - #mov $g0, 8*3(%rsp) # corrected |g1| - - mov 8*8($in_ptr), @acc[0] # |u| - mov 8*13($in_ptr), @acc[4] # |v| - mov @acc[0], @acc[1] - imulq 8*0(%rsp), @acc[0] # |u|*|f0| - mov @acc[4], @acc[5] - imulq 8*1(%rsp), @acc[4] # |v|*|g0| - add @acc[4], @acc[0] - mov @acc[0], 8*4($out_ptr) # destination |u| - sar \$63, @acc[0] # sign extension - mov @acc[0], 8*5($out_ptr) - mov @acc[0], 8*6($out_ptr) - mov @acc[0], 8*7($out_ptr) - mov @acc[0], 8*8($out_ptr) - lea 8*8($in_ptr), $in_ptr # make in_ptr "rewindable" with xor - - imulq $f0, @acc[1] # |u|*|f1| - imulq $g0, @acc[5] # |v|*|g1| - add @acc[5], @acc[1] - mov @acc[1], 8*9($out_ptr) # destination |v| - sar \$63, @acc[1] # sign extension - mov @acc[1], 8*10($out_ptr) - mov @acc[1], 8*11($out_ptr) - mov @acc[1], 8*12($out_ptr) - mov @acc[1], 8*13($out_ptr) -___ -for($i=2; $i<15; $i++) { -my $smul_512x63 = $i>8 ? 
"__smulq_512x63" - : "__smulq_256x63"; -$code.=<<___; - xor \$256+8*8, $in_ptr # flip-flop pointer to source |a|b|u|v| - mov \$31, $cnt - call __ab_approximation_31_256 - #mov $f0, 8*0(%rsp) - #mov $g0, 8*1(%rsp) - mov $f1, 8*2(%rsp) - mov $g1, 8*3(%rsp) - - mov \$256, $out_ptr - xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| - call __smulq_256_n_shift_by_31 - mov $f0, 8*0(%rsp) # corrected |f0| - mov $g0, 8*1(%rsp) # corrected |g0| - - mov 8*2(%rsp), $f0 # |f1| - mov 8*3(%rsp), $g0 # |g1| - lea 8*4($out_ptr), $out_ptr # pointer to destination |b| - call __smulq_256_n_shift_by_31 - mov $f0, 8*2(%rsp) # corrected |f1| - mov $g0, 8*3(%rsp) # corrected |g1| - - mov 8*0(%rsp), $f0 # |f0| - mov 8*1(%rsp), $g0 # |g0| - lea 8*8($in_ptr), $in_ptr # pointer to source |u|v| - lea 8*4($out_ptr), $out_ptr # pointer to destination |u| - call __smulq_256x63 - - mov 8*2(%rsp), $f0 # |f1| - mov 8*3(%rsp), $g0 # |g1| - lea 8*5($out_ptr),$out_ptr # pointer to destination |v| - call $smul_512x63 -___ -$code.=<<___ if ($i==8); - sar \$63, %rbp # sign extension - mov %rbp, 8*5($out_ptr) - mov %rbp, 8*6($out_ptr) - mov %rbp, 8*7($out_ptr) -___ -} -$code.=<<___; - ################################# two[!] last iterations in one go - xor \$256+8*8, $in_ptr # flip-flop pointer to source |a|b|u|v| - mov \$47, $cnt # 31 + 512 % 31 - #call __ab_approximation_31 # |a| and |b| are exact, just load - mov 8*0($in_ptr), @acc[0] # |a_lo| - #xor @acc[1], @acc[1] # |a_hi| - mov 8*4($in_ptr), @acc[2] # |b_lo| - #xor @acc[3], @acc[3] # |b_hi| - call __inner_loop_62_256 - #mov $f0, 8*0(%rsp) - #mov $g0, 8*1(%rsp) - #mov $f1, 8*2(%rsp) - #mov $g1, 8*3(%rsp) - - #mov 8*0(%rsp), $f0 # |f0| - #mov 8*1(%rsp), $g0 # |g0| - lea 8*8($in_ptr), $in_ptr # pointer to source |u|v| - #lea 8*6($out_ptr), $out_ptr # pointer to destination |u| - #call __smulq_256x63 - - #mov 8*2(%rsp), $f0 # |f1| - #mov 8*3(%rsp), $g0 # |g1| - mov $f1, $f0 - mov $g1, $g0 - mov 8*4(%rsp), $out_ptr # original |out_ptr| - call __smulq_512x63 - adc %rbp, %rdx # the excess limb of the result - - mov 8*5(%rsp), $in_ptr # original |nx_ptr| - mov %rdx, %rax - sar \$63, %rdx # result's sign as mask - - mov %rdx, @acc[0] # mask |modulus| - mov %rdx, @acc[1] - and 8*0($in_ptr), @acc[0] - mov %rdx, @acc[2] - and 8*1($in_ptr), @acc[1] - and 8*2($in_ptr), @acc[2] - and 8*3($in_ptr), %rdx - - add @acc[0], @acc[4] # conditionally add |modulus|<<256 - adc @acc[1], @acc[5] - adc @acc[2], @acc[6] - adc %rdx, @acc[7] - adc \$0, %rax - - mov %rax, %rdx - neg %rax - or %rax, %rdx # excess bit or sign as mask - sar \$63, %rax # excess bit as mask - - mov %rdx, @acc[0] # mask |modulus| - mov %rdx, @acc[1] - and 8*0($in_ptr), @acc[0] - mov %rdx, @acc[2] - and 8*1($in_ptr), @acc[1] - and 8*2($in_ptr), @acc[2] - and 8*3($in_ptr), %rdx - - xor %rax, @acc[0] # conditionally negate |modulus| - xor %rcx, %rcx - xor %rax, @acc[1] - sub %rax, %rcx - xor %rax, @acc[2] - xor %rax, %rdx - add %rcx, @acc[0] - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, %rdx - - add @acc[0], @acc[4] # final adjustment for |modulus|<<256 - adc @acc[1], @acc[5] - adc @acc[2], @acc[6] - adc %rdx, @acc[7] - - mov @acc[4], 8*4($out_ptr) # store absolute value - mov @acc[5], 8*5($out_ptr) - mov @acc[6], 8*6($out_ptr) - mov @acc[7], 8*7($out_ptr) - - lea $frame(%rsp), %r8 # size optimization - mov 8*0(%r8),%r15 -.cfi_restore %r15 - mov 8*1(%r8),%r14 -.cfi_restore %r14 - mov 8*2(%r8),%r13 -.cfi_restore %r13 - mov 8*3(%r8),%r12 -.cfi_restore %r12 - mov 8*4(%r8),%rbx -.cfi_restore %rbx - mov 8*5(%r8),%rbp 
-.cfi_restore %rbp - lea 8*6(%r8),%rsp -.cfi_adjust_cfa_offset -$frame-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size ct_inverse_mod_256,.-ct_inverse_mod_256 -___ -######################################################################## -# Signed |u|*|f?|+|v|*|g?| subroutines. "NNN" in "NNNx63" suffix refers -# to the maximum bit-length of the *result*, and "63" - to the maximum -# bit-length of the |f?| and |g?| single-limb multiplicands. However! -# The latter should not be taken literally, as they are always chosen so -# that "bad things" don't happen. For example, there comes a point when -# |v| grows beyond 383 bits, while |u| remains 383 bits wide. Yet, we -# always call __smul_383x63 to perform |u|*|f0|+|v|*|g0| step. This is -# because past that point |f0| is always 1 and |g0| is always 0. And, -# since |u| never grows beyond 383 bits, __smul_767x63 doesn't have to -# perform full-width |u|*|f1| multiplication, half-width one with sign -# extension is sufficient... -$code.=<<___; -.type __smulq_512x63,\@abi-omnipotent -.align 32 -__smulq_512x63: - mov 8*0($in_ptr), @acc[0] # load |u| - mov 8*1($in_ptr), @acc[1] - mov 8*2($in_ptr), @acc[2] - mov 8*3($in_ptr), @acc[3] - mov 8*4($in_ptr), %rbp # sign limb - - mov $f0, %rbx - sar \$63, $f0 # |f0|'s sign as mask - xor %rax, %rax - sub $f0, %rax # |f0|'s sign as bit - - xor $f0, %rbx # conditionally negate |f0| - add %rax, %rbx - - xor $f0, @acc[0] # conditionally negate |u| - xor $f0, @acc[1] - xor $f0, @acc[2] - xor $f0, @acc[3] - xor $f0, %rbp - add @acc[0], %rax - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, %rbp - - mulq %rbx # |u|*|f0| - mov %rax, 8*0($out_ptr) # offload |u|*|f0| - mov @acc[1], %rax - mov %rdx, @acc[1] -___ -for($i=1; $i<3; $i++) { -$code.=<<___; - mulq %rbx - add %rax, @acc[$i] - mov @acc[$i+1], %rax - adc \$0, %rdx - mov @acc[$i], 8*$i($out_ptr) - mov %rdx, @acc[$i+1] -___ -} -$code.=<<___; - and %rbx, %rbp - neg %rbp - mulq %rbx - add %rax, @acc[3] - adc %rdx, %rbp - mov @acc[3], 8*3($out_ptr) - - mov 8*5($in_ptr), @acc[0] # load |v| - mov 8*6($in_ptr), @acc[1] - mov 8*7($in_ptr), @acc[2] - mov 8*8($in_ptr), @acc[3] - mov 8*9($in_ptr), @acc[4] - mov 8*10($in_ptr), @acc[5] - mov 8*11($in_ptr), @acc[6] - mov 8*12($in_ptr), @acc[7] - - mov $g0, $f0 - sar \$63, $f0 # |g0|'s sign as mask - xor %rax, %rax - sub $f0, %rax # |g0|'s sign as bit - - xor $f0, $g0 # conditionally negate |g0| - add %rax, $g0 - - xor $f0, @acc[0] # conditionally negate |v| - xor $f0, @acc[1] - xor $f0, @acc[2] - xor $f0, @acc[3] - xor $f0, @acc[4] - xor $f0, @acc[5] - xor $f0, @acc[6] - xor $f0, @acc[7] - add @acc[0], %rax - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, @acc[5] - adc \$0, @acc[6] - adc \$0, @acc[7] - - mulq $g0 - mov %rax, @acc[0] - mov @acc[1], %rax - mov %rdx, @acc[1] -___ -for($i=1; $i<7; $i++) { -$code.=<<___; - mulq $g0 - add %rax, @acc[$i] - mov @acc[$i+1], %rax - adc \$0, %rdx - mov %rdx, @acc[$i+1] -___ -} -$code.=<<___; - imulq $g0 - add %rax, @acc[7] - adc \$0, %rdx # used in the final step - - mov %rbp, %rbx - sar \$63, %rbp # sign extension - - add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| - adc 8*1($out_ptr), @acc[1] - adc 8*2($out_ptr), @acc[2] - adc 8*3($out_ptr), @acc[3] - adc %rbx, @acc[4] - adc %rbp, @acc[5] - adc %rbp, @acc[6] - adc %rbp, @acc[7] - - mov @acc[0], 8*0($out_ptr) - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov @acc[4], 8*4($out_ptr) - mov @acc[5], 8*5($out_ptr) - mov @acc[6], 
8*6($out_ptr) - mov @acc[7], 8*7($out_ptr) - - ret -.size __smulq_512x63,.-__smulq_512x63 - -.type __smulq_256x63,\@abi-omnipotent -.align 32 -__smulq_256x63: -___ -for($j=0; $j<2; $j++) { -my $k = 8*5*$j; -my @acc=@acc; @acc=@acc[4..7] if($j); -my $top="%rbp"; $top=$g0 if($j); -$code.=<<___; - mov $k+8*0($in_ptr), @acc[0] # load |u| (or |v|) - mov $k+8*1($in_ptr), @acc[1] - mov $k+8*2($in_ptr), @acc[2] - mov $k+8*3($in_ptr), @acc[3] - mov $k+8*4($in_ptr), $top # sign/excess limb - - mov $f0, %rbx - sar \$63, $f0 # |f0|'s sign as mask (or |g0|'s) - xor %rax, %rax - sub $f0, %rax # |f0|'s sign as bit (or |g0|'s) - - xor $f0, %rbx # conditionally negate |f0| - add %rax, %rbx - - xor $f0, @acc[0] # conditionally negate |u| (or |v|) - xor $f0, @acc[1] - xor $f0, @acc[2] - xor $f0, @acc[3] - xor $f0, $top - add @acc[0], %rax - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, $top - - mulq %rbx - mov %rax, @acc[0] - mov @acc[1], %rax - mov %rdx, @acc[1] -___ -for($i=1; $i<3; $i++) { -$code.=<<___; - mulq %rbx - add %rax, @acc[$i] - mov @acc[$i+1], %rax - adc \$0, %rdx - mov %rdx, @acc[$i+1] -___ -} -$code.=<<___; - and %rbx, $top - neg $top - mulq %rbx - add %rax, @acc[3] - adc %rdx, $top -___ -$code.=<<___ if ($j==0); - mov $g0, $f0 -___ -} -$code.=<<___; - add @acc[4], @acc[0] # accumulate |u|*|f0| - adc @acc[5], @acc[1] - adc @acc[6], @acc[2] - adc @acc[7], @acc[3] - adc %rcx, %rbp - - mov @acc[0], 8*0($out_ptr) - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov %rbp, 8*4($out_ptr) - - ret -.size __smulq_256x63,.-__smulq_256x63 -___ -######################################################################## -# Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of -# the names refers to maximum bit-lengths of |a| and |b|. As already -# mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always -# chosen so that "bad things" don't happen. For example, so that the -# sum of the products doesn't overflow, and that the final result is -# never wider than inputs... 
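In plain-integer terms, each of these n_shift subroutines performs one step of the reference algorithm: multiply |a| and |b| by the signed single-limb factors, shift the sum right by k, and fold a negative result back into the signs of the factors. A hedged sketch of that step (the function name is ours, not blst's; Python's >> is an arithmetic shift, matching the reference):

    def smul_n_shift(a, b, f, g, k=31):
        # One __smulq_256_n_shift_by_31-style step: r = (a*f + b*g) >> k,
        # returned as |r|; if r came out negative, the signs of f and g are
        # flipped so the caller's invariant a*f + b*g >= 0 is restored.
        r = (a * f + b * g) >> k
        if r < 0:
            r, f, g = -r, -f, -g
        return r, f, g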
-{ -$code.=<<___; -.type __smulq_256_n_shift_by_31,\@abi-omnipotent -.align 32 -__smulq_256_n_shift_by_31: - mov $f0, 8*0($out_ptr) # offload |f0| - mov $g0, 8*1($out_ptr) # offload |g0| - mov $f0, %rbp -___ -for($j=0; $j<2; $j++) { -my $k = 8*4*$j; -my @acc=@acc; @acc=@acc[4..7] if ($j); -my $f0="%rbp"; $f0=$g0 if ($j); -$code.=<<___; - mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|) - mov $k+8*1($in_ptr), @acc[1] - mov $k+8*2($in_ptr), @acc[2] - mov $k+8*3($in_ptr), @acc[3] - - mov $f0, %rbx - sar \$63, $f0 # |f0|'s sign as mask (or |g0|'s) - xor %rax, %rax - sub $f0, %rax # |f0|'s sign as bit (or |g0|'s) - - xor $f0, %rbx # conditionally negate |f0| (or |g0|) - add %rax, %rbx - - xor $f0, @acc[0] # conditionally negate |a| (or |b|) - xor $f0, @acc[1] - xor $f0, @acc[2] - xor $f0, @acc[3] - add @acc[0], %rax - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - - mulq %rbx - mov %rax, @acc[0] - mov @acc[1], %rax - and %rbx, $f0 - neg $f0 - mov %rdx, @acc[1] -___ -for($i=1; $i<3; $i++) { -$code.=<<___; - mulq %rbx - add %rax, @acc[$i] - mov @acc[$i+1], %rax - adc \$0, %rdx - mov %rdx, @acc[$i+1] -___ -} -$code.=<<___; - mulq %rbx - add %rax, @acc[3] - adc %rdx, $f0 -___ -} -$code.=<<___; - add @acc[4], @acc[0] - adc @acc[5], @acc[1] - adc @acc[6], @acc[2] - adc @acc[7], @acc[3] - adc $g0, %rbp - - mov 8*0($out_ptr), $f0 # restore original |f0| - mov 8*1($out_ptr), $g0 # restore original |g0| - - shrd \$31, @acc[1], @acc[0] - shrd \$31, @acc[2], @acc[1] - shrd \$31, @acc[3], @acc[2] - shrd \$31, %rbp, @acc[3] - - sar \$63, %rbp # sign as mask - xor %rax, %rax - sub %rbp, %rax # sign as bit - - xor %rbp, @acc[0] # conditionally negate the result - xor %rbp, @acc[1] - xor %rbp, @acc[2] - xor %rbp, @acc[3] - add %rax, @acc[0] - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - - mov @acc[0], 8*0($out_ptr) - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - - xor %rbp, $f0 # conditionally negate |f0| - xor %rbp, $g0 # conditionally negate |g0| - add %rax, $f0 - add %rax, $g0 - - ret -.size __smulq_256_n_shift_by_31,.-__smulq_256_n_shift_by_31 -___ -} - -{ -my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11)); -my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15"); -my ($fg0, $fg1, $bias) = ($g0, $g1, $t4); -my ($a_, $b_) = ($a_lo, $b_lo); -{ -my @a = ($a_lo, $t1, $a_hi); -my @b = ($b_lo, $t2, $b_hi); - -$code.=<<___; -.type __ab_approximation_31_256,\@abi-omnipotent -.align 32 -__ab_approximation_31_256: - mov 8*3($in_ptr), @a[2] # load |a| in reverse order - mov 8*7($in_ptr), @b[2] # load |b| in reverse order - mov 8*2($in_ptr), @a[1] - mov 8*6($in_ptr), @b[1] - mov 8*1($in_ptr), @a[0] - mov 8*5($in_ptr), @b[0] - - mov @a[2], $t0 - or @b[2], $t0 # check top-most limbs, ... - cmovz @a[1], @a[2] - cmovz @b[1], @b[2] - cmovz @a[0], @a[1] - mov 8*0($in_ptr), @a[0] - cmovz @b[0], @b[1] - mov 8*4($in_ptr), @b[0] - - mov @a[2], $t0 - or @b[2], $t0 # ... and ones before that ... 
- cmovz @a[1], @a[2] - cmovz @b[1], @b[2] - cmovz @a[0], @a[1] - cmovz @b[0], @b[1] - - mov @a[2], $t0 - or @b[2], $t0 - bsr $t0, %rcx - lea 1(%rcx), %rcx - cmovz @a[0], @a[2] - cmovz @b[0], @b[2] - cmovz $t0, %rcx - neg %rcx - #and \$63, %rcx # debugging artefact - - shldq %cl, @a[1], @a[2] # align second limb to the left - shldq %cl, @b[1], @b[2] - - mov \$0x7FFFFFFF, %eax - and %rax, @a[0] - and %rax, @b[0] - not %rax - and %rax, @a[2] - and %rax, @b[2] - or @a[2], @a[0] - or @b[2], @b[0] - - jmp __inner_loop_31_256 - - ret -.size __ab_approximation_31_256,.-__ab_approximation_31_256 -___ -} -$code.=<<___; -.type __inner_loop_31_256,\@abi-omnipotent -.align 32 # comment and punish Coffee Lake by up to 40% -__inner_loop_31_256: ################# by Thomas Pornin - mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0 - mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1 - mov \$0x7FFFFFFF7FFFFFFF, $bias - -.Loop_31_256: - cmp $b_, $a_ # if |a_|<|b_|, swap the variables - mov $a_, $t0 - mov $b_, $t1 - mov $fg0, $t2 - mov $fg1, $t3 - cmovb $b_, $a_ - cmovb $t0, $b_ - cmovb $fg1, $fg0 - cmovb $t2, $fg1 - - sub $b_, $a_ # |a_|-|b_| - sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1| - add $bias, $fg0 - - test \$1, $t0 # if |a_| was even, roll back - cmovz $t0, $a_ - cmovz $t1, $b_ - cmovz $t2, $fg0 - cmovz $t3, $fg1 - - shr \$1, $a_ # |a_|>>=1 - add $fg1, $fg1 # |f1|<<=1, |g1|<<=1 - sub $bias, $fg1 - sub \$1, $cnt - jnz .Loop_31_256 - - shr \$32, $bias - mov %ecx, %edx # $fg0, $f0 - mov ${fg1}d, ${f1}d - shr \$32, $g0 - shr \$32, $g1 - sub $bias, $f0 # remove the bias - sub $bias, $g0 - sub $bias, $f1 - sub $bias, $g1 - - ret -.size __inner_loop_31_256,.-__inner_loop_31_256 - -.type __inner_loop_62_256,\@abi-omnipotent -.align 32 -__inner_loop_62_256: - mov $cnt, %r15d - mov \$1, $f0 # |f0|=1 - xor $g0, $g0 # |g0|=0 - xor $f1, $f1 # |f1|=0 - mov $f0, $g1 # |g1|=1 - mov $f0, %r14 - -.Loop_62_256: - xor $t0, $t0 - test %r14, $a_lo # if |a_| is odd, then we'll be subtracting |b_| - mov $b_lo, $t1 - cmovnz $b_lo, $t0 - sub $a_lo, $t1 # |b_|-|a_| - mov $a_lo, $t2 - sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even) - cmovc $t1, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_| - cmovc $t2, $b_lo # |b_| = |a_| - mov $f0, $t0 # exchange |f0| and |f1| - cmovc $f1, $f0 - cmovc $t0, $f1 - mov $g0, $t1 # exchange |g0| and |g1| - cmovc $g1, $g0 - cmovc $t1, $g1 - xor $t0, $t0 - xor $t1, $t1 - shr \$1, $a_lo - test %r14, $t2 # if |a_| was odd, then we'll be subtracting... - cmovnz $f1, $t0 - cmovnz $g1, $t1 - add $f1, $f1 # |f1|<<=1 - add $g1, $g1 # |g1|<<=1 - sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even) - sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...) - sub \$1, %r15d - jnz .Loop_62_256 - - ret -.size __inner_loop_62_256,.-__inner_loop_62_256 -___ -} - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/ct_inverse_mod_384-armv8.pl b/crypto/blst_src/asm/ct_inverse_mod_384-armv8.pl deleted file mode 100755 index 268bf9d2546..00000000000 --- a/crypto/blst_src/asm/ct_inverse_mod_384-armv8.pl +++ /dev/null @@ -1,610 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Both constant-time and fast Euclidean inversion as suggested in -# https://eprint.iacr.org/2020/972. Performance is >12x better [on -# Cortex cores] than modulus-specific FLT addition chain... 
-# -# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod); -# -$python_ref.=<<'___'; -def ct_inverse_mod_383(inp, mod): - a, u = inp, 1 - b, v = mod, 0 - - k = 62 - w = 64 - mask = (1 << w) - 1 - - for i in range(0, 766 // k): - # __ab_approximation_62 - n = max(a.bit_length(), b.bit_length()) - if n < 128: - a_, b_ = a, b - else: - a_ = (a & mask) | ((a >> (n-w)) << w) - b_ = (b & mask) | ((b >> (n-w)) << w) - - # __inner_loop_62 - f0, g0, f1, g1 = 1, 0, 0, 1 - for j in range(0, k): - if a_ & 1: - if a_ < b_: - a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 - a_, f0, g0 = a_-b_, f0-f1, g0-g1 - a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 - - # __smul_383_n_shift_by_62 - a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k - if a < 0: - a, f0, g0 = -a, -f0, -g0 - if b < 0: - b, f1, g1 = -b, -f1, -g1 - - # __smul_767x63 - u, v = u*f0 + v*g0, u*f1 + v*g1 - - if 766 % k: - f0, g0, f1, g1 = 1, 0, 0, 1 - for j in range(0, 766 % k): - if a & 1: - if a < b: - a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 - a, f0, g0 = a-b, f0-f1, g0-g1 - a, f1, g1 = a >> 1, f1 << 1, g1 << 1 - - v = u*f1 + v*g1 - - if v < 0: - v += mod << (768 - mod.bit_length()) # left aligned - - return v & (2**768 - 1) # to be reduced % mod -___ - -$flavour = shift; -$output = shift; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3)); -my @acc=map("x$_",(3..14)); -my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(15..17,19..21)); -my $cnt = $n_ptr; -my @t = map("x$_",(22..28,2)); -my ($a_lo, $a_hi, $b_lo, $b_hi) = @acc[0,5,6,11]; - -$frame = 16+2*512; - -$code.=<<___; -.text - -.globl ct_inverse_mod_383 -.type ct_inverse_mod_383, %function -.align 5 -ct_inverse_mod_383: - paciasp - stp x29, x30, [sp,#-128]! - add x29, sp, #0 - stp x19, x20, [sp,#16] - stp x21, x22, [sp,#32] - stp x23, x24, [sp,#48] - stp x25, x26, [sp,#64] - stp x27, x28, [sp,#80] - sub sp, sp, #$frame - - ldp @t[0], @acc[1], [$in_ptr,#8*0] - ldp @acc[2], @acc[3], [$in_ptr,#8*2] - ldp @acc[4], @acc[5], [$in_ptr,#8*4] - - add $in_ptr, sp, #16+511 // find closest 512-byte-aligned spot - and $in_ptr, $in_ptr, #-512 // in the frame... 
- stp $out_ptr, $nx_ptr, [sp] - - ldp @acc[6], @acc[7], [$n_ptr,#8*0] - ldp @acc[8], @acc[9], [$n_ptr,#8*2] - ldp @acc[10], @acc[11], [$n_ptr,#8*4] - - stp @t[0], @acc[1], [$in_ptr,#8*0] // copy input to |a| - stp @acc[2], @acc[3], [$in_ptr,#8*2] - stp @acc[4], @acc[5], [$in_ptr,#8*4] - stp @acc[6], @acc[7], [$in_ptr,#8*6] // copy modulus to |b| - stp @acc[8], @acc[9], [$in_ptr,#8*8] - stp @acc[10], @acc[11], [$in_ptr,#8*10] - - ////////////////////////////////////////// first iteration - mov $cnt, #62 - bl .Lab_approximation_62_loaded - - eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - str $f0,[$out_ptr,#8*12] // initialize |u| with |f0| - - mov $f0, $f1 // |f1| - mov $g0, $g1 // |g1| - add $out_ptr, $out_ptr, #8*6 // pointer to dst |b| - bl __smul_383_n_shift_by_62 - str $f0, [$out_ptr,#8*12] // initialize |v| with |f1| - - ////////////////////////////////////////// second iteration - eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| - mov $cnt, #62 - bl __ab_approximation_62 - - eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov $f_, $f0 // corrected |f0| - mov $g_, $g0 // corrected |g0| - - mov $f0, $f1 // |f1| - mov $g0, $g1 // |g1| - add $out_ptr, $out_ptr, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - ldr @acc[4], [$in_ptr,#8*12] // |u| - ldr @acc[5], [$in_ptr,#8*18] // |v| - mul @acc[0], $f_, @acc[4] // |u|*|f0| - smulh @acc[1], $f_, @acc[4] - mul @acc[2], $g_, @acc[5] // |v|*|g0| - smulh @acc[3], $g_, @acc[5] - adds @acc[0], @acc[0], @acc[2] - adc @acc[1], @acc[1], @acc[3] - stp @acc[0], @acc[1], [$out_ptr,#8*6] - asr @acc[2], @acc[1], #63 // sign extenstion - stp @acc[2], @acc[2], [$out_ptr,#8*8] - stp @acc[2], @acc[2], [$out_ptr,#8*10] - - mul @acc[0], $f0, @acc[4] // |u|*|f1| - smulh @acc[1], $f0, @acc[4] - mul @acc[2], $g0, @acc[5] // |v|*|g1| - smulh @acc[3], $g0, @acc[5] - adds @acc[0], @acc[0], @acc[2] - adc @acc[1], @acc[1], @acc[3] - stp @acc[0], @acc[1], [$out_ptr,#8*12] - asr @acc[2], @acc[1], #63 // sign extenstion - stp @acc[2], @acc[2], [$out_ptr,#8*14] - stp @acc[2], @acc[2], [$out_ptr,#8*16] -___ -for($i=2; $i<11; $i++) { -$code.=<<___; - eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| - mov $cnt, #62 - bl __ab_approximation_62 - - eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov $f_, $f0 // corrected |f0| - mov $g_, $g0 // corrected |g0| - - mov $f0, $f1 // |f1| - mov $g0, $g1 // |g1| - add $out_ptr, $out_ptr, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add $out_ptr, $out_ptr, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov $f_, $f0 // corrected |f1| - mov $g_, $g0 // corrected |g1| - add $out_ptr, $out_ptr, #8*6 // pointer to destination |v| - bl __smul_383x63 -___ -$code.=<<___ if ($i>5); - bl __smul_767x63_tail -___ -$code.=<<___ if ($i==5); - asr @t[5], @t[5], #63 // sign extension - stp @t[5], @t[5], [$out_ptr,#8*6] - stp @t[5], @t[5], [$out_ptr,#8*8] - stp @t[5], @t[5], [$out_ptr,#8*10] -___ -} -$code.=<<___; - ////////////////////////////////////////// iteration before last - eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| - mov $cnt, #62 - //bl __ab_approximation_62 // |a| and |b| are exact, - ldp $a_lo, $a_hi, [$in_ptr,#8*0] // just load - ldp $b_lo, $b_hi, [$in_ptr,#8*6] - bl __inner_loop_62 - - eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| - str $a_lo, [$out_ptr,#8*0] - str $b_lo, [$out_ptr,#8*6] - - mov $f_, $f0 // exact |f0| - mov $g_, $g0 // 
exact |g0| - mov $f0, $f1 - mov $g0, $g1 - add $out_ptr, $out_ptr, #8*12 // pointer to dst |u| - bl __smul_383x63 - - mov $f_, $f0 // exact |f1| - mov $g_, $g0 // exact |g1| - add $out_ptr, $out_ptr, #8*6 // pointer to dst |v| - bl __smul_383x63 - bl __smul_767x63_tail - - ////////////////////////////////////////// last iteration - eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| - mov $cnt, #22 // 766 % 62 - //bl __ab_approximation_62 // |a| and |b| are exact, - ldr $a_lo, [$in_ptr,#8*0] // just load - eor $a_hi, $a_hi, $a_hi - ldr $b_lo, [$in_ptr,#8*6] - eor $b_hi, $b_hi, $b_hi - bl __inner_loop_62 - - mov $f_, $f1 - mov $g_, $g1 - ldp $out_ptr, $f0, [sp] // original out_ptr and n_ptr - bl __smul_383x63 - bl __smul_767x63_tail - ldr x30, [x29,#8] - - asr @t[0], @acc[5], #63 // sign as mask - ldp @acc[6], @acc[7], [$f0,#8*0] - ldp @acc[8], @acc[9], [$f0,#8*2] - ldp @acc[10], @acc[11], [$f0,#8*4] - - and @acc[6], @acc[6], @t[0] // add mod<<384 conditionally - and @acc[7], @acc[7], @t[0] - adds @acc[0], @acc[0], @acc[6] - and @acc[8], @acc[8], @t[0] - adcs @acc[1], @acc[1], @acc[7] - and @acc[9], @acc[9], @t[0] - adcs @acc[2], @acc[2], @acc[8] - and @acc[10], @acc[10], @t[0] - adcs @acc[3], @acc[3], @acc[9] - and @acc[11], @acc[11], @t[0] - stp @acc[0], @acc[1], [$out_ptr,#8*6] - adcs @acc[4], @acc[4], @acc[10] - stp @acc[2], @acc[3], [$out_ptr,#8*8] - adc @acc[5], @acc[5], @acc[11] - stp @acc[4], @acc[5], [$out_ptr,#8*10] - - add sp, sp, #$frame - ldp x19, x20, [x29,#16] - ldp x21, x22, [x29,#32] - ldp x23, x24, [x29,#48] - ldp x25, x26, [x29,#64] - ldp x27, x28, [x29,#80] - ldr x29, [sp],#128 - autiasp - ret -.size ct_inverse_mod_383,.-ct_inverse_mod_383 - -//////////////////////////////////////////////////////////////////////// -// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
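The __smul_383x63 code below relies throughout on the sign-as-mask trick: the factor's sign is broadcast to an all-ones/all-zeros mask, every limb is XORed with it, and the +1 of two's-complement negation is carried through the addition chain so the actual multiplication runs on non-negative values. A limb-level sketch of that pattern under the assumption of little-endian 64-bit limbs (names are illustrative, not blst API):

    MASK64 = (1 << 64) - 1

    def cond_negate_limbs(limbs, f):
        # Model of the conditional-negation pattern in __smul_383x63:
        # turn the sign of f into a mask, XOR it into every limb (given
        # low limb first) and carry the +1 of two's complement upward,
        # returning |value| as limbs together with |f|.
        sign = MASK64 if f < 0 else 0   # sign "as mask"
        carry = 1 if f < 0 else 0       # the +1 of two's complement
        out = []
        for limb in limbs:
            limb = (limb ^ sign) + carry
            carry = limb >> 64
            out.append(limb & MASK64)
        return out, abs(f)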
-.type __smul_383x63, %function -.align 5 -__smul_383x63: -___ -for($j=0; $j<2; $j++) { -my $f_ = $f_; $f_ = $g_ if ($j); -my @acc = @acc; @acc = @acc[6..11] if ($j); -my $k = 8*12+8*6*$j; -$code.=<<___; - ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |u| (or |v|) - asr $f1, $f_, #63 // |f_|'s sign as mask (or |g_|'s) - ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] - eor $f_, $f_, $f1 // conditionally negate |f_| (or |g_|) - ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k] - - eor @acc[0], @acc[0], $f1 // conditionally negate |u| (or |v|) - sub $f_, $f_, $f1 - eor @acc[1], @acc[1], $f1 - adds @acc[0], @acc[0], $f1, lsr#63 - eor @acc[2], @acc[2], $f1 - adcs @acc[1], @acc[1], xzr - eor @acc[3], @acc[3], $f1 - adcs @acc[2], @acc[2], xzr - eor @acc[4], @acc[4], $f1 - adcs @acc[3], @acc[3], xzr - umulh @t[0], @acc[0], $f_ - eor @acc[5], @acc[5], $f1 - umulh @t[1], @acc[1], $f_ - adcs @acc[4], @acc[4], xzr - umulh @t[2], @acc[2], $f_ - adcs @acc[5], @acc[5], xzr - umulh @t[3], @acc[3], $f_ -___ -$code.=<<___ if ($j); - adc $g1, xzr, xzr // used in __smul_767x63_tail -___ -$code.=<<___; - umulh @t[4], @acc[4], $f_ - mul @acc[0], @acc[0], $f_ - mul @acc[1], @acc[1], $f_ - mul @acc[2], @acc[2], $f_ - adds @acc[1], @acc[1], @t[0] - mul @acc[3], @acc[3], $f_ - adcs @acc[2], @acc[2], @t[1] - mul @acc[4], @acc[4], $f_ - adcs @acc[3], @acc[3], @t[2] - mul @t[5+$j],@acc[5], $f_ - adcs @acc[4], @acc[4], @t[3] - adcs @t[5+$j],@t[5+$j],@t[4] -___ -$code.=<<___ if ($j==0); - adc @t[7], xzr, xzr -___ -} -$code.=<<___; - adc @t[7], @t[7], xzr - - adds @acc[0], @acc[0], @acc[6] - adcs @acc[1], @acc[1], @acc[7] - adcs @acc[2], @acc[2], @acc[8] - adcs @acc[3], @acc[3], @acc[9] - stp @acc[0], @acc[1], [$out_ptr,#8*0] - adcs @acc[4], @acc[4], @acc[10] - stp @acc[2], @acc[3], [$out_ptr,#8*2] - adcs @t[5], @t[5], @t[6] - stp @acc[4], @t[5], [$out_ptr,#8*4] - adc @t[6], @t[7], xzr // used in __smul_767x63_tail - - ret -.size __smul_383x63,.-__smul_383x63 - -.type __smul_767x63_tail, %function -.align 5 -__smul_767x63_tail: - smulh @t[5], @acc[5], $f_ - ldp @acc[0], @acc[1], [$in_ptr,#8*24] // load rest of |v| - umulh @acc[11],@acc[11], $g_ - ldp @acc[2], @acc[3], [$in_ptr,#8*26] - ldp @acc[4], @acc[5], [$in_ptr,#8*28] - - eor @acc[0], @acc[0], $f1 // conditionally negate rest of |v| - eor @acc[1], @acc[1], $f1 - eor @acc[2], @acc[2], $f1 - adds @acc[0], @acc[0], $g1 - eor @acc[3], @acc[3], $f1 - adcs @acc[1], @acc[1], xzr - eor @acc[4], @acc[4], $f1 - adcs @acc[2], @acc[2], xzr - eor @acc[5], @acc[5], $f1 - adcs @acc[3], @acc[3], xzr - umulh @t[0], @acc[0], $g_ - adcs @acc[4], @acc[4], xzr - umulh @t[1], @acc[1], $g_ - adc @acc[5], @acc[5], xzr - - umulh @t[2], @acc[2], $g_ - add @acc[11], @acc[11], @t[6] - umulh @t[3], @acc[3], $g_ - asr @t[6], @t[5], #63 - umulh @t[4], @acc[4], $g_ - mul @acc[0], @acc[0], $g_ - mul @acc[1], @acc[1], $g_ - mul @acc[2], @acc[2], $g_ - adds @acc[0], @acc[0], @acc[11] - mul @acc[3], @acc[3], $g_ - adcs @acc[1], @acc[1], @t[0] - mul @acc[4], @acc[4], $g_ - adcs @acc[2], @acc[2], @t[1] - mul @acc[5], @acc[5], $g_ - adcs @acc[3], @acc[3], @t[2] - adcs @acc[4], @acc[4], @t[3] - adc @acc[5], @acc[5], @t[4] - - adds @acc[0], @acc[0], @t[5] - adcs @acc[1], @acc[1], @t[6] - adcs @acc[2], @acc[2], @t[6] - adcs @acc[3], @acc[3], @t[6] - stp @acc[0], @acc[1], [$out_ptr,#8*6] - adcs @acc[4], @acc[4], @t[6] - stp @acc[2], @acc[3], [$out_ptr,#8*8] - adc @acc[5], @acc[5], @t[6] - stp @acc[4], @acc[5], [$out_ptr,#8*10] - - ret -.size __smul_767x63_tail,.-__smul_767x63_tail - -.type __smul_383_n_shift_by_62, 
%function -.align 5 -__smul_383_n_shift_by_62: -___ -for($j=0; $j<2; $j++) { -my $f0 = $f0; $f0 = $g0 if ($j); -my @acc = @acc; @acc = @acc[6..11] if ($j); -my $k = 8*6*$j; -$code.=<<___; - ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |a| (or |b|) - asr @t[6], $f0, #63 // |f0|'s sign as mask (or |g0|'s) - ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] - eor @t[7], $f0, @t[6] // conditionally negate |f0| (or |g0|) - ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k] - - eor @acc[0], @acc[0], @t[6] // conditionally negate |a| (or |b|) - sub @t[7], @t[7], @t[6] - eor @acc[1], @acc[1], @t[6] - adds @acc[0], @acc[0], @t[6], lsr#63 - eor @acc[2], @acc[2], @t[6] - adcs @acc[1], @acc[1], xzr - eor @acc[3], @acc[3], @t[6] - adcs @acc[2], @acc[2], xzr - eor @acc[4], @acc[4], @t[6] - umulh @t[0], @acc[0], @t[7] - adcs @acc[3], @acc[3], xzr - umulh @t[1], @acc[1], @t[7] - eor @acc[5], @acc[5], @t[6] - umulh @t[2], @acc[2], @t[7] - adcs @acc[4], @acc[4], xzr - umulh @t[3], @acc[3], @t[7] - adc @acc[5], @acc[5], xzr - - umulh @t[4], @acc[4], @t[7] - smulh @t[5+$j], @acc[5], @t[7] - mul @acc[0], @acc[0], @t[7] - mul @acc[1], @acc[1], @t[7] - mul @acc[2], @acc[2], @t[7] - adds @acc[1], @acc[1], @t[0] - mul @acc[3], @acc[3], @t[7] - adcs @acc[2], @acc[2], @t[1] - mul @acc[4], @acc[4], @t[7] - adcs @acc[3], @acc[3], @t[2] - mul @acc[5], @acc[5], @t[7] - adcs @acc[4], @acc[4], @t[3] - adcs @acc[5], @acc[5] ,@t[4] - adc @t[5+$j], @t[5+$j], xzr -___ -} -$code.=<<___; - adds @acc[0], @acc[0], @acc[6] - adcs @acc[1], @acc[1], @acc[7] - adcs @acc[2], @acc[2], @acc[8] - adcs @acc[3], @acc[3], @acc[9] - adcs @acc[4], @acc[4], @acc[10] - adcs @acc[5], @acc[5], @acc[11] - adc @acc[6], @t[5], @t[6] - - extr @acc[0], @acc[1], @acc[0], #62 - extr @acc[1], @acc[2], @acc[1], #62 - extr @acc[2], @acc[3], @acc[2], #62 - asr @t[6], @acc[6], #63 - extr @acc[3], @acc[4], @acc[3], #62 - extr @acc[4], @acc[5], @acc[4], #62 - extr @acc[5], @acc[6], @acc[5], #62 - - eor @acc[0], @acc[0], @t[6] - eor @acc[1], @acc[1], @t[6] - adds @acc[0], @acc[0], @t[6], lsr#63 - eor @acc[2], @acc[2], @t[6] - adcs @acc[1], @acc[1], xzr - eor @acc[3], @acc[3], @t[6] - adcs @acc[2], @acc[2], xzr - eor @acc[4], @acc[4], @t[6] - adcs @acc[3], @acc[3], xzr - eor @acc[5], @acc[5], @t[6] - stp @acc[0], @acc[1], [$out_ptr,#8*0] - adcs @acc[4], @acc[4], xzr - stp @acc[2], @acc[3], [$out_ptr,#8*2] - adc @acc[5], @acc[5], xzr - stp @acc[4], @acc[5], [$out_ptr,#8*4] - - eor $f0, $f0, @t[6] - eor $g0, $g0, @t[6] - sub $f0, $f0, @t[6] - sub $g0, $g0, @t[6] - - ret -.size __smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62 -___ - -{ -my @a = @acc[0..5]; -my @b = @acc[6..11]; - -$code.=<<___; -.type __ab_approximation_62, %function -.align 4 -__ab_approximation_62: - ldp @a[4], @a[5], [$in_ptr,#8*4] - ldp @b[4], @b[5], [$in_ptr,#8*10] - ldp @a[2], @a[3], [$in_ptr,#8*2] - ldp @b[2], @b[3], [$in_ptr,#8*8] - -.Lab_approximation_62_loaded: - orr @t[0], @a[5], @b[5] // check top-most limbs, ... - cmp @t[0], #0 - csel @a[5], @a[5], @a[4], ne - csel @b[5], @b[5], @b[4], ne - csel @a[4], @a[4], @a[3], ne - orr @t[0], @a[5], @b[5] // ... ones before top-most, ... - csel @b[4], @b[4], @b[3], ne - - ldp @a[0], @a[1], [$in_ptr,#8*0] - ldp @b[0], @b[1], [$in_ptr,#8*6] - - cmp @t[0], #0 - csel @a[5], @a[5], @a[4], ne - csel @b[5], @b[5], @b[4], ne - csel @a[4], @a[4], @a[2], ne - orr @t[0], @a[5], @b[5] // ... and ones before that ... 
- csel @b[4], @b[4], @b[2], ne - - cmp @t[0], #0 - csel @a[5], @a[5], @a[4], ne - csel @b[5], @b[5], @b[4], ne - csel @a[4], @a[4], @a[1], ne - orr @t[0], @a[5], @b[5] - csel @b[4], @b[4], @b[1], ne - - clz @t[0], @t[0] - cmp @t[0], #64 - csel @t[0], @t[0], xzr, ne - csel @a[5], @a[5], @a[4], ne - csel @b[5], @b[5], @b[4], ne - neg @t[1], @t[0] - - lslv @a[5], @a[5], @t[0] // align high limbs to the left - lslv @b[5], @b[5], @t[0] - lsrv @a[4], @a[4], @t[1] - lsrv @b[4], @b[4], @t[1] - and @a[4], @a[4], @t[1], asr#6 - and @b[4], @b[4], @t[1], asr#6 - orr @a[5], @a[5], @a[4] - orr @b[5], @b[5], @b[4] - - b __inner_loop_62 - ret -.size __ab_approximation_62,.-__ab_approximation_62 -___ -} -$code.=<<___; -.type __inner_loop_62, %function -.align 4 -__inner_loop_62: - mov $f0, #1 // |f0|=1 - mov $g0, #0 // |g0|=0 - mov $f1, #0 // |f1|=0 - mov $g1, #1 // |g1|=1 - -.Loop_62: - sbfx @t[6], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting - sub $cnt, $cnt, #1 - subs @t[2], $b_lo, $a_lo // |b_|-|a_| - and @t[0], $b_lo, @t[6] - sbc @t[3], $b_hi, $a_hi - and @t[1], $b_hi, @t[6] - subs @t[4], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) - mov @t[0], $f0 - sbcs @t[5], $a_hi, @t[1] - mov @t[1], $g0 - csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_| - csel $b_hi, $b_hi, $a_hi, hs - csel $a_lo, @t[4], @t[2], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - csel $a_hi, @t[5], @t[3], hs - csel $f0, $f0, $f1, hs // exchange |f0| and |f1| - csel $f1, $f1, @t[0], hs - csel $g0, $g0, $g1, hs // exchange |g0| and |g1| - csel $g1, $g1, @t[1], hs - extr $a_lo, $a_hi, $a_lo, #1 - lsr $a_hi, $a_hi, #1 - and @t[0], $f1, @t[6] - and @t[1], $g1, @t[6] - add $f1, $f1, $f1 // |f1|<<=1 - add $g1, $g1, $g1 // |g1|<<=1 - sub $f0, $f0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) - sub $g0, $g0, @t[1] // |g0|-=|g1| (or |g0-=0| ...) - cbnz $cnt, .Loop_62 - - ret -.size __inner_loop_62,.-__inner_loop_62 -___ - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/ct_is_square_mod_384-armv8.pl b/crypto/blst_src/asm/ct_is_square_mod_384-armv8.pl deleted file mode 100755 index 4128dc3236d..00000000000 --- a/crypto/blst_src/asm/ct_is_square_mod_384-armv8.pl +++ /dev/null @@ -1,401 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Both constant-time and fast quadratic residue test as suggested in -# https://eprint.iacr.org/2020/972. Performance is >12x better [on -# Cortex cores] than modulus-specific Legendre symbol addition chain... 
-# -# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod); -# -$python_ref.=<<'___'; -def ct_is_square_mod_384(inp, mod): - a = inp - b = mod - L = 0 # only least significant bit, adding 1 makes up for sign change - - k = 30 - w = 32 - mask = (1 << w) - 1 - - for i in range(0, 768 // k - 1): - # __ab_approximation_30 - n = max(a.bit_length(), b.bit_length()) - if n < 64: - a_, b_ = a, b - else: - a_ = (a & mask) | ((a >> (n-w)) << w) - b_ = (b & mask) | ((b >> (n-w)) << w) - - # __inner_loop_30 - f0, g0, f1, g1 = 1, 0, 0, 1 - for j in range(0, k): - if a_ & 1: - if a_ < b_: - a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 - L += (a_ & b_) >> 1 # |a| and |b| are both odd, second bits - # tell the whole story - a_, f0, g0 = a_-b_, f0-f1, g0-g1 - a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 - L += (b_ + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] - - # __smulq_384_n_shift_by_30 - a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k - if b < 0: - b = -b - if a < 0: - a = -a - L += (b % 4) >> 1 # |b| is always odd, the second bit - # tells the whole story - - if True: - for j in range(0, 768 % k + k): - if a & 1: - if a < b: - a, b = b, a - L += (a & b) >> 1 # |a| and |b| are both odd, second bits - # tell the whole story - a = a-b - a = a >> 1 - L += (b + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] - - return (L & 1) ^ 1 -___ - -$flavour = shift; -$output = shift; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -my ($in_ptr, $out_ptr, $L) = map("x$_", (0..2)); -my @acc=map("x$_",(3..14)); -my ($cnt, $f0, $g0, $f1, $g1) = map("x$_",(15..17,19..20)); -my @t = map("x$_",(21..28)); -my ($a_, $b_) = @acc[5,11]; - -$frame = 2*256; - -$code.=<<___; -.text - -.globl ct_is_square_mod_384 -.type ct_is_square_mod_384, %function -.align 5 -ct_is_square_mod_384: - paciasp - stp x29, x30, [sp,#-128]! - add x29, sp, #0 - stp x19, x20, [sp,#16] - stp x21, x22, [sp,#32] - stp x23, x24, [sp,#48] - stp x25, x26, [sp,#64] - stp x27, x28, [sp,#80] - sub sp, sp, #$frame - - ldp @acc[0], @acc[1], [x0,#8*0] // load input - ldp @acc[2], @acc[3], [x0,#8*2] - ldp @acc[4], @acc[5], [x0,#8*4] - - add $in_ptr, sp, #255 // find closest 256-byte-aligned spot - and $in_ptr, $in_ptr, #-256 // in the frame... 
- - ldp @acc[6], @acc[7], [x1,#8*0] // load modulus - ldp @acc[8], @acc[9], [x1,#8*2] - ldp @acc[10], @acc[11], [x1,#8*4] - - stp @acc[0], @acc[1], [$in_ptr,#8*6] // copy input to |a| - stp @acc[2], @acc[3], [$in_ptr,#8*8] - stp @acc[4], @acc[5], [$in_ptr,#8*10] - stp @acc[6], @acc[7], [$in_ptr,#8*0] // copy modulus to |b| - stp @acc[8], @acc[9], [$in_ptr,#8*2] - stp @acc[10], @acc[11], [$in_ptr,#8*4] - - eor $L, $L, $L // init the Legendre symbol - mov $cnt, #24 // 24 is 768/30-1 - b .Loop_is_square - -.align 4 -.Loop_is_square: - bl __ab_approximation_30 - sub $cnt, $cnt, #1 - - eor $out_ptr, $in_ptr, #128 // pointer to dst |b| - bl __smul_384_n_shift_by_30 - - mov $f1, $f0 // |f0| - mov $g1, $g0 // |g0| - add $out_ptr, $out_ptr, #8*6 // pointer to dst |a| - bl __smul_384_n_shift_by_30 - - ldp @acc[6], @acc[7], [$out_ptr,#-8*6] - eor $in_ptr, $in_ptr, #128 // flip-flop src |a|b| - and @t[6], @t[6], @acc[6] // if |a| was negative, - add $L, $L, @t[6], lsr#1 // adjust |L| - - cbnz $cnt, .Loop_is_square - - ////////////////////////////////////////// last iteration - //bl __ab_approximation_30 // |a| and |b| are exact, - //ldr $a_, [$in_ptr,#8*6] // and loaded - //ldr $b_, [$in_ptr,#8*0] - mov $cnt, #48 // 48 is 768%30 + 30 - bl __inner_loop_48 - ldr x30, [x29,#8] - - and x0, $L, #1 - eor x0, x0, #1 - - add sp, sp, #$frame - ldp x19, x20, [x29,#16] - ldp x21, x22, [x29,#32] - ldp x23, x24, [x29,#48] - ldp x25, x26, [x29,#64] - ldp x27, x28, [x29,#80] - ldr x29, [sp],#128 - autiasp - ret -.size ct_is_square_mod_384,.-ct_is_square_mod_384 - -.type __smul_384_n_shift_by_30, %function -.align 5 -__smul_384_n_shift_by_30: -___ -for($j=0; $j<2; $j++) { -my $fx = $g1; $fx = $f1 if ($j); -my @acc = @acc; @acc = @acc[6..11] if ($j); -my $k = 8*6*$j; -$code.=<<___; - ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |b| (or |a|) - asr @t[6], $fx, #63 // |g1|'s sign as mask (or |f1|'s) - ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] - eor $fx, $fx, @t[6] // conditionally negate |g1| (or |f1|) - ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k] - - eor @acc[0], @acc[0], @t[6] // conditionally negate |b| (or |a|) - sub $fx, $fx, @t[6] - eor @acc[1], @acc[1], @t[6] - adds @acc[0], @acc[0], @t[6], lsr#63 - eor @acc[2], @acc[2], @t[6] - adcs @acc[1], @acc[1], xzr - eor @acc[3], @acc[3], @t[6] - adcs @acc[2], @acc[2], xzr - eor @acc[4], @acc[4], @t[6] - umulh @t[0], @acc[0], $fx - adcs @acc[3], @acc[3], xzr - umulh @t[1], @acc[1], $fx - eor @acc[5], @acc[5], @t[6] - umulh @t[2], @acc[2], $fx - adcs @acc[4], @acc[4], xzr - umulh @t[3], @acc[3], $fx - adc @acc[5], @acc[5], xzr - - umulh @t[4], @acc[4], $fx - and @t[7], $fx, @t[6] - umulh @t[5+$j], @acc[5], $fx - neg @t[7], @t[7] - mul @acc[0], @acc[0], $fx - mul @acc[1], @acc[1], $fx - mul @acc[2], @acc[2], $fx - adds @acc[1], @acc[1], @t[0] - mul @acc[3], @acc[3], $fx - adcs @acc[2], @acc[2], @t[1] - mul @acc[4], @acc[4], $fx - adcs @acc[3], @acc[3], @t[2] - mul @acc[5], @acc[5], $fx - adcs @acc[4], @acc[4], @t[3] - adcs @acc[5], @acc[5] ,@t[4] - adc @t[5+$j], @t[5+$j], @t[7] -___ -} -$code.=<<___; - adds @acc[0], @acc[0], @acc[6] - adcs @acc[1], @acc[1], @acc[7] - adcs @acc[2], @acc[2], @acc[8] - adcs @acc[3], @acc[3], @acc[9] - adcs @acc[4], @acc[4], @acc[10] - adcs @acc[5], @acc[5], @acc[11] - adc @acc[6], @t[5], @t[6] - - extr @acc[0], @acc[1], @acc[0], #30 - extr @acc[1], @acc[2], @acc[1], #30 - extr @acc[2], @acc[3], @acc[2], #30 - asr @t[6], @acc[6], #63 - extr @acc[3], @acc[4], @acc[3], #30 - extr @acc[4], @acc[5], @acc[4], #30 - extr @acc[5], @acc[6], @acc[5], #30 - - 
eor @acc[0], @acc[0], @t[6] - eor @acc[1], @acc[1], @t[6] - adds @acc[0], @acc[0], @t[6], lsr#63 - eor @acc[2], @acc[2], @t[6] - adcs @acc[1], @acc[1], xzr - eor @acc[3], @acc[3], @t[6] - adcs @acc[2], @acc[2], xzr - eor @acc[4], @acc[4], @t[6] - adcs @acc[3], @acc[3], xzr - eor @acc[5], @acc[5], @t[6] - stp @acc[0], @acc[1], [$out_ptr,#8*0] - adcs @acc[4], @acc[4], xzr - stp @acc[2], @acc[3], [$out_ptr,#8*2] - adc @acc[5], @acc[5], xzr - stp @acc[4], @acc[5], [$out_ptr,#8*4] - - ret -.size __smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30 -___ - -{ -my @a = @acc[0..5]; -my @b = @acc[6..11]; -my ($fg0, $fg1, $bias, $cnt) = ($g0, $g1, @t[6], @t[7]); - -$code.=<<___; -.type __ab_approximation_30, %function -.align 4 -__ab_approximation_30: - ldp @b[4], @b[5], [$in_ptr,#8*4] // |a| is still in registers - ldp @b[2], @b[3], [$in_ptr,#8*2] - - orr @t[0], @a[5], @b[5] // check top-most limbs, ... - cmp @t[0], #0 - csel @a[5], @a[5], @a[4], ne - csel @b[5], @b[5], @b[4], ne - csel @a[4], @a[4], @a[3], ne - orr @t[0], @a[5], @b[5] // ... ones before top-most, ... - csel @b[4], @b[4], @b[3], ne - - cmp @t[0], #0 - csel @a[5], @a[5], @a[4], ne - csel @b[5], @b[5], @b[4], ne - csel @a[4], @a[4], @a[2], ne - orr @t[0], @a[5], @b[5] // ... and ones before that ... - csel @b[4], @b[4], @b[2], ne - - cmp @t[0], #0 - csel @a[5], @a[5], @a[4], ne - csel @b[5], @b[5], @b[4], ne - csel @a[4], @a[4], @a[1], ne - orr @t[0], @a[5], @b[5] // and one more, ... - csel @b[4], @b[4], @b[1], ne - - cmp @t[0], #0 - csel @a[5], @a[5], @a[4], ne - csel @b[5], @b[5], @b[4], ne - csel @a[4], @a[4], @a[0], ne - orr @t[0], @a[5], @b[5] - csel @b[4], @b[4], @b[0], ne - - clz @t[0], @t[0] - cmp @t[0], #64 - csel @t[0], @t[0], xzr, ne - csel @a[5], @a[5], @a[4], ne - csel @b[5], @b[5], @b[4], ne - neg @t[1], @t[0] - - lslv @a[5], @a[5], @t[0] // align high limbs to the left - lslv @b[5], @b[5], @t[0] - lsrv @a[4], @a[4], @t[1] - lsrv @b[4], @b[4], @t[1] - and @a[4], @a[4], @t[1], asr#6 - and @b[4], @b[4], @t[1], asr#6 - orr $a_, @a[5], @a[4] - orr $b_, @b[5], @b[4] - - bfxil $a_, @a[0], #0, #32 - bfxil $b_, @b[0], #0, #32 - - b __inner_loop_30 - ret -.size __ab_approximation_30,.-__ab_approximation_30 - -.type __inner_loop_30, %function -.align 4 -__inner_loop_30: - mov $cnt, #30 - mov $fg0, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 - mov $fg1, #0x800000007FFFFFFF // |f1|=0, |g1|=1 - mov $bias,#0x7FFFFFFF7FFFFFFF - -.Loop_30: - sbfx @t[3], $a_, #0, #1 // if |a_| is odd, then we'll be subtracting - and @t[4], $a_, $b_ - sub $cnt, $cnt, #1 - and @t[0], $b_, @t[3] - - sub @t[1], $b_, $a_ // |b_|-|a_| - subs @t[2], $a_, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) - add @t[4], $L, @t[4], lsr#1 // L + (a_ & b_) >> 1 - mov @t[0], $fg1 - csel $b_, $b_, $a_, hs // |b_| = |a_| - csel $a_, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - csel $fg1, $fg1, $fg0, hs // exchange |fg0| and |fg1| - csel $fg0, $fg0, @t[0], hs - csel $L, $L, @t[4], hs - lsr $a_, $a_, #1 - and @t[0], $fg1, @t[3] - and @t[1], $bias, @t[3] - add $t[2], $b_, #2 - sub $fg0, $fg0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) - add $fg1, $fg1, $fg1 // |f1|<<=1 - add $L, $L, $t[2], lsr#2 // "negate" |L| if |b|%8 is 3 or 5 - add $fg0, $fg0, @t[1] - sub $fg1, $fg1, $bias - - cbnz $cnt, .Loop_30 - - mov $bias, #0x7FFFFFFF - ubfx $f0, $fg0, #0, #32 - ubfx $g0, $fg0, #32, #32 - ubfx $f1, $fg1, #0, #32 - ubfx $g1, $fg1, #32, #32 - sub $f0, $f0, $bias // remove the bias - sub $g0, $g0, $bias - sub $f1, $f1, $bias - sub $g1, $g1, $bias - - ret -.size 
__inner_loop_30,.-__inner_loop_30 -___ -} - -{ -my ($a_, $b_) = (@acc[0], @acc[6]); -$code.=<<___; -.type __inner_loop_48, %function -.align 4 -__inner_loop_48: -.Loop_48: - sbfx @t[3], $a_, #0, #1 // if |a_| is odd, then we'll be subtracting - and @t[4], $a_, $b_ - sub $cnt, $cnt, #1 - and @t[0], $b_, @t[3] - sub @t[1], $b_, $a_ // |b_|-|a_| - subs @t[2], $a_, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) - add @t[4], $L, @t[4], lsr#1 - csel $b_, $b_, $a_, hs // |b_| = |a_| - csel $a_, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - csel $L, $L, @t[4], hs - add $t[2], $b_, #2 - lsr $a_, $a_, #1 - add $L, $L, $t[2], lsr#2 // "negate" |L| if |b|%8 is 3 or 5 - - cbnz $cnt, .Loop_48 - - ret -.size __inner_loop_48,.-__inner_loop_48 -___ -} - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/ct_is_square_mod_384-x86_64.pl b/crypto/blst_src/asm/ct_is_square_mod_384-x86_64.pl deleted file mode 100755 index 40016ed70d2..00000000000 --- a/crypto/blst_src/asm/ct_is_square_mod_384-x86_64.pl +++ /dev/null @@ -1,494 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Both constant-time and fast quadratic residue test as suggested in -# https://eprint.iacr.org/2020/972. Performance is >5x better than -# modulus-specific Legendre symbol addition chain... -# -# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod); -# -$python_ref.=<<'___'; -def ct_is_square_mod_384(inp, mod): - a = inp - b = mod - L = 0 # only least significant bit, adding 1 makes up for sign change - - k = 30 - w = 32 - mask = (1 << w) - 1 - - for i in range(0, 768 // k - 1): - # __ab_approximation_30 - n = max(a.bit_length(), b.bit_length()) - if n < 64: - a_, b_ = a, b - else: - a_ = (a & mask) | ((a >> (n-w)) << w) - b_ = (b & mask) | ((b >> (n-w)) << w) - - # __inner_loop_30 - f0, g0, f1, g1 = 1, 0, 0, 1 - for j in range(0, k): - if a_ & 1: - if a_ < b_: - a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 - L += (a_ & b_) >> 1 # |a| and |b| are both odd, second bits - # tell the whole story - a_, f0, g0 = a_-b_, f0-f1, g0-g1 - a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 - L += (b_ + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] - - # __smulq_384_n_shift_by_30 - a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k - if b < 0: - b = -b - if a < 0: - a = -a - L += (b % 4) >> 1 # |b| is always odd, the second bit - # tells the whole story - - if True: - for j in range(0, 768 % k + k): - if a & 1: - if a < b: - a, b = b, a - L += (a & b) >> 1 # |a| and |b| are both odd, second bits - # tell the whole story - a = a-b - a = a >> 1 - L += (b + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] - - return (L & 1) ^ 1 -___ - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" - or die "can't call $xlate: $!"; - -my ($out_ptr, $in_ptr) = ("%rdi", "%rsi"); -my ($f0, $g0, $f1, $g1) = ("%rax", "%rbx", "%rdx","%rcx"); -my @acc=map("%r$_",(8..15)); -my $L = "%rbp"; - -$frame = 8*3+2*256; - -$code.=<<___; -.text - -.globl ct_is_square_mod_384 -.type ct_is_square_mod_384,\@function,2,"unwind" -.align 32 
-ct_is_square_mod_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$$frame, %rsp -.cfi_adjust_cfa_offset $frame -.cfi_end_prologue - - lea 8*3+255(%rsp), %rax # find closest 256-byte-aligned spot - and \$-256, %rax # in the frame... - - mov 8*0(%rdi), @acc[0] # load input - mov 8*1(%rdi), @acc[1] - mov 8*2(%rdi), @acc[2] - mov 8*3(%rdi), @acc[3] - mov 8*4(%rdi), @acc[4] - mov 8*5(%rdi), @acc[5] - - mov 8*0(%rsi), @acc[6] # load modulus - mov 8*1(%rsi), @acc[7] - mov 8*2(%rsi), %rbx - mov 8*3(%rsi), %rcx - mov 8*4(%rsi), %rdx - mov 8*5(%rsi), %rdi - mov %rax, $in_ptr # pointer to source |a|b| - - mov @acc[0], 8*0(%rax) # copy input to |a| - mov @acc[1], 8*1(%rax) - mov @acc[2], 8*2(%rax) - mov @acc[3], 8*3(%rax) - mov @acc[4], 8*4(%rax) - mov @acc[5], 8*5(%rax) - - mov @acc[6], 8*6(%rax) # copy modulus to |b| - mov @acc[7], 8*7(%rax) - mov %rbx, 8*8(%rax) - mov %rcx, 8*9(%rax) - mov %rdx, 8*10(%rax) - mov %rdi, 8*11(%rax) - - xor $L, $L # initialize the Legendre symbol - mov \$24, %ecx # 24 is 768/30-1 - jmp .Loop_is_square - -.align 32 -.Loop_is_square: - mov %ecx, 8*2(%rsp) # offload loop counter - - call __ab_approximation_30 - mov $f0, 8*0(%rsp) # offload |f0| and |g0| - mov $g0, 8*1(%rsp) - - mov \$128+8*6, $out_ptr - xor $in_ptr, $out_ptr # pointer to destination |b| - call __smulq_384_n_shift_by_30 - - mov 8*0(%rsp), $f1 # pop |f0| and |g0| - mov 8*1(%rsp), $g1 - lea -8*6($out_ptr),$out_ptr # pointer to destination |a| - call __smulq_384_n_shift_by_30 - - mov 8*2(%rsp), %ecx # re-load loop counter - xor \$128, $in_ptr # flip-flop pointer to source |a|b| - - and 8*6($out_ptr), @acc[6] # if |a| was negative, adjust |L| - shr \$1, @acc[6] - add @acc[6], $L - - sub \$1, %ecx - jnz .Loop_is_square - - ################################# last iteration - #call __ab_approximation_30 # |a| and |b| are exact, just load - #mov 8*0($in_ptr), @acc[0] # |a_| - mov 8*6($in_ptr), @acc[1] # |b_| - call __inner_loop_48 # 48 is 768%30+30 - - mov \$1, %rax - and $L, %rax - xor \$1, %rax # return value - - lea $frame(%rsp), %r8 # size optimization - mov 8*0(%r8),%r15 -.cfi_restore %r15 - mov 8*1(%r8),%r14 -.cfi_restore %r14 - mov 8*2(%r8),%r13 -.cfi_restore %r13 - mov 8*3(%r8),%r12 -.cfi_restore %r12 - mov 8*4(%r8),%rbx -.cfi_restore %rbx - mov 8*5(%r8),%rbp -.cfi_restore %rbp - lea 8*6(%r8),%rsp -.cfi_adjust_cfa_offset -$frame-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size ct_is_square_mod_384,.-ct_is_square_mod_384 - -.type __smulq_384_n_shift_by_30,\@abi-omnipotent -.align 32 -__smulq_384_n_shift_by_30: -___ -for($j=0; $j<2; $j++) { -$code.=<<___; - mov 8*0($in_ptr), @acc[0] # load |a| (or |b|) - mov 8*1($in_ptr), @acc[1] - mov 8*2($in_ptr), @acc[2] - mov 8*3($in_ptr), @acc[3] - mov 8*4($in_ptr), @acc[4] - mov 8*5($in_ptr), @acc[5] - - mov %rdx, %rbx # |f1| (or |g1|) - sar \$63, %rdx # |f1|'s sign as mask (or |g1|'s) - xor %rax, %rax - sub %rdx, %rax # |f1|'s sign as bit (or |g1|'s) - - xor %rdx, %rbx # conditionally negate |f1| (or |g1|) - add %rax, %rbx - - xor %rdx, @acc[0] # conditionally negate |a| (or |b|) - xor %rdx, @acc[1] - xor %rdx, @acc[2] - xor %rdx, @acc[3] - xor %rdx, @acc[4] - xor %rdx, @acc[5] - add @acc[0], %rax - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, @acc[5] - - mov %rdx, @acc[6+$j] - and %rbx, @acc[6+$j] - mulq %rbx # |a|*|f1| (or |b|*|g1|) - mov %rax, @acc[0] - mov @acc[1], %rax - mov %rdx, 
@acc[1] -___ -for($i=1; $i<5; $i++) { -$code.=<<___; - mulq %rbx - add %rax, @acc[$i] - mov @acc[$i+1], %rax - adc \$0, %rdx - mov %rdx, @acc[$i+1] -___ -} -$code.=<<___; - neg @acc[6+$j] - mulq %rbx - add %rax, @acc[5] - adc %rdx, @acc[6+$j] -___ -$code.=<<___ if ($j==0); - lea 8*6($in_ptr), $in_ptr # pointer to |b| - mov $g1, %rdx - - mov @acc[0], 8*0($out_ptr) - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov @acc[4], 8*4($out_ptr) - mov @acc[5], 8*5($out_ptr) -___ -} -$code.=<<___; - lea -8*6($in_ptr), $in_ptr # restore original in_ptr - - add 8*0($out_ptr), @acc[0] - adc 8*1($out_ptr), @acc[1] - adc 8*2($out_ptr), @acc[2] - adc 8*3($out_ptr), @acc[3] - adc 8*4($out_ptr), @acc[4] - adc 8*5($out_ptr), @acc[5] - adc @acc[7], @acc[6] - - shrd \$30, @acc[1], @acc[0] - shrd \$30, @acc[2], @acc[1] - shrd \$30, @acc[3], @acc[2] - shrd \$30, @acc[4], @acc[3] - shrd \$30, @acc[5], @acc[4] - shrd \$30, @acc[6], @acc[5] - - sar \$63, @acc[6] # sign as mask - xor %rbx, %rbx - sub @acc[6], %rbx # sign as bit - - xor @acc[6], @acc[0] # conditionally negate the result - xor @acc[6], @acc[1] - xor @acc[6], @acc[2] - xor @acc[6], @acc[3] - xor @acc[6], @acc[4] - xor @acc[6], @acc[5] - add %rbx, @acc[0] - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, @acc[5] - - mov @acc[0], 8*0($out_ptr) - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov @acc[4], 8*4($out_ptr) - mov @acc[5], 8*5($out_ptr) - - ret -.size __smulq_384_n_shift_by_30,.-__smulq_384_n_shift_by_30 -___ -{ -my ($a_, $b_) = @acc[0..1]; -my ($t0, $t1, $t2, $t3, $t4, $t5) = map("%r$_",(10..15)); -my ($fg0, $fg1, $bias) = ($g0, $g1, $t5); -my $cnt = "%edi"; -{ -my @a = @acc[0..5]; -my @b = (@a[1..3], $t4, $t5, $g0); - -$code.=<<___; -.type __ab_approximation_30,\@abi-omnipotent -.align 32 -__ab_approximation_30: - mov 8*11($in_ptr), @b[5] # load |b| in reverse order - mov 8*10($in_ptr), @b[4] - mov 8*9($in_ptr), @b[3] - - mov @a[5], %rax - or @b[5], %rax # check top-most limbs, ... - cmovz @a[4], @a[5] - cmovz @b[4], @b[5] - cmovz @a[3], @a[4] - mov 8*8($in_ptr), @b[2] - cmovz @b[3], @b[4] - - mov @a[5], %rax - or @b[5], %rax # ... ones before top-most, ... - cmovz @a[4], @a[5] - cmovz @b[4], @b[5] - cmovz @a[2], @a[4] - mov 8*7($in_ptr), @b[1] - cmovz @b[2], @b[4] - - mov @a[5], %rax - or @b[5], %rax # ... and ones before that ... - cmovz @a[4], @a[5] - cmovz @b[4], @b[5] - cmovz @a[1], @a[4] - mov 8*6($in_ptr), @b[0] - cmovz @b[1], @b[4] - - mov @a[5], %rax - or @b[5], %rax # ... and ones before that ... 
- cmovz @a[4], @a[5] - cmovz @b[4], @b[5] - cmovz @a[0], @a[4] - cmovz @b[0], @b[4] - - mov @a[5], %rax - or @b[5], %rax - bsr %rax, %rcx - lea 1(%rcx), %rcx - cmovz @a[0], @a[5] - cmovz @b[0], @b[5] - cmovz %rax, %rcx - neg %rcx - #and \$63, %rcx # debugging artefact - - shldq %cl, @a[4], @a[5] # align second limb to the left - shldq %cl, @b[4], @b[5] - - mov \$0xFFFFFFFF00000000, %rax - mov @a[0]d, ${a_}d - mov @b[0]d, ${b_}d - and %rax, @a[5] - and %rax, @b[5] - or @a[5], ${a_} - or @b[5], ${b_} - - jmp __inner_loop_30 - - ret -.size __ab_approximation_30,.-__ab_approximation_30 -___ -} -$code.=<<___; -.type __inner_loop_30,\@abi-omnipotent -.align 32 -__inner_loop_30: ################# by Thomas Pornin - mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0 - mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1 - lea -1($fg0), $bias # 0x7FFFFFFF7FFFFFFF - mov \$30, $cnt - -.Loop_30: - mov $a_, %rax - and $b_, %rax - shr \$1, %rax # (a_ & b_) >> 1 - - cmp $b_, $a_ # if |a_|<|b_|, swap the variables - mov $a_, $t0 - mov $b_, $t1 - lea (%rax,$L), %rax # pre-"negate" |L| - mov $fg0, $t2 - mov $fg1, $t3 - mov $L, $t4 - cmovb $b_, $a_ - cmovb $t0, $b_ - cmovb $fg1, $fg0 - cmovb $t2, $fg1 - cmovb %rax, $L - - sub $b_, $a_ # |a_|-|b_| - sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1| - add $bias, $fg0 - - test \$1, $t0 # if |a_| was even, roll back - cmovz $t0, $a_ - cmovz $t1, $b_ - cmovz $t2, $fg0 - cmovz $t3, $fg1 - cmovz $t4, $L - - lea 2($b_), %rax - shr \$1, $a_ # |a_|>>=1 - shr \$2, %rax - add $fg1, $fg1 # |f1|<<=1, |g1|<<=1 - lea (%rax,$L), $L # "negate" |L| if |b|%8 is 3 or 5 - sub $bias, $fg1 - - sub \$1, $cnt - jnz .Loop_30 - - shr \$32, $bias - mov %ebx, %eax # $fg0 -> $f0 - shr \$32, $g0 - mov %ecx, %edx # $fg1 -> $f1 - shr \$32, $g1 - sub $bias, $f0 # remove the bias - sub $bias, $g0 - sub $bias, $f1 - sub $bias, $g1 - - ret -.size __inner_loop_30,.-__inner_loop_30 - -.type __inner_loop_48,\@abi-omnipotent -.align 32 -__inner_loop_48: - mov \$48, $cnt # 48 is 768%30+30 - -.Loop_48: - mov $a_, %rax - and $b_, %rax - shr \$1, %rax # (a_ & b_) >> 1 - - cmp $b_, $a_ # if |a_|<|b_|, swap the variables - mov $a_, $t0 - mov $b_, $t1 - lea (%rax,$L), %rax - mov $L, $t2 - cmovb $b_, $a_ - cmovb $t0, $b_ - cmovb %rax, $L - - sub $b_, $a_ # |a_|-|b_| - - test \$1, $t0 # if |a_| was even, roll back - cmovz $t0, $a_ - cmovz $t1, $b_ - cmovz $t2, $L - - lea 2($b_), %rax - shr \$1, $a_ # |a_|>>=1 - shr \$2, %rax - add %rax, $L # "negate" |L| if |b|%8 is 3 or 5 - - sub \$1, $cnt - jnz .Loop_48 - - ret -.size __inner_loop_48,.-__inner_loop_48 -___ -} - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/ctq_inverse_mod_384-x86_64.pl b/crypto/blst_src/asm/ctq_inverse_mod_384-x86_64.pl deleted file mode 100755 index 2be39d8ba8b..00000000000 --- a/crypto/blst_src/asm/ctq_inverse_mod_384-x86_64.pl +++ /dev/null @@ -1,886 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Both constant-time and fast Euclidean inversion as suggested in -# https://eprint.iacr.org/2020/972. Performance is >5x better than -# modulus-specific FLT addition chain... 
-# -# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod); -# -$python_ref.=<<'___'; -def ct_inverse_mod_383(inp, mod): - a, u = inp, 1 - b, v = mod, 0 - - k = 62 - w = 64 - mask = (1 << w) - 1 - - for i in range(0, 766 // k): - # __ab_approximation_62 - n = max(a.bit_length(), b.bit_length()) - if n < 128: - a_, b_ = a, b - else: - a_ = (a & mask) | ((a >> (n-w)) << w) - b_ = (b & mask) | ((b >> (n-w)) << w) - - # __inner_loop_62 - f0, g0, f1, g1 = 1, 0, 0, 1 - for j in range(0, k): - if a_ & 1: - if a_ < b_: - a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 - a_, f0, g0 = a_-b_, f0-f1, g0-g1 - a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 - - # __smulq_383_n_shift_by_62 - a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k - if a < 0: - a, f0, g0 = -a, -f0, -g0 - if b < 0: - b, f1, g1 = -b, -f1, -g1 - - # __smulq_767x63 - u, v = u*f0 + v*g0, u*f1 + v*g1 - - if 766 % k: - f0, g0, f1, g1 = 1, 0, 0, 1 - for j in range(0, 766 % k): - if a & 1: - if a < b: - a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 - a, f0, g0 = a-b, f0-f1, g0-g1 - a, f1, g1 = a >> 1, f1 << 1, g1 << 1 - - v = u*f1 + v*g1 - - if v < 0: - v += mod << (768 - mod.bit_length()) # left aligned - - return v & (2**768 - 1) # to be reduced % mod -___ - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" - or die "can't call $xlate: $!"; - -my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx"); -my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr); -my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13"); -my $cnt = "%edi"; - -$frame = 8*11+2*512; - -$code.=<<___; -.text - -.globl ct_inverse_mod_383 -.type ct_inverse_mod_383,\@function,4,"unwind" -.align 32 -ct_inverse_mod_383: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$$frame, %rsp -.cfi_adjust_cfa_offset $frame -.cfi_end_prologue - - lea 8*11+511(%rsp), %rax # find closest 512-byte-aligned spot - and \$-512, %rax # in the frame... 
- mov $out_ptr, 8*4(%rsp) - mov $nx_ptr, 8*5(%rsp) - - mov 8*0($in_ptr), @acc[0] # load input - mov 8*1($in_ptr), @acc[1] - mov 8*2($in_ptr), @acc[2] - mov 8*3($in_ptr), @acc[3] - mov 8*4($in_ptr), @acc[4] - mov 8*5($in_ptr), @acc[5] - - mov 8*0($n_ptr), @acc[6] # load modulus - mov 8*1($n_ptr), @acc[7] - mov 8*2($n_ptr), @acc[8] - mov 8*3($n_ptr), @acc[9] - mov 8*4($n_ptr), @acc[10] - mov 8*5($n_ptr), @acc[11] - - mov @acc[0], 8*0(%rax) # copy input to |a| - mov @acc[1], 8*1(%rax) - mov @acc[2], 8*2(%rax) - mov @acc[3], 8*3(%rax) - mov @acc[4], 8*4(%rax) - mov @acc[5], 8*5(%rax) - - mov @acc[6], 8*6(%rax) # copy modulus to |b| - mov @acc[7], 8*7(%rax) - mov @acc[8], 8*8(%rax) - mov @acc[9], 8*9(%rax) - mov @acc[10], 8*10(%rax) - mov %rax, $in_ptr # pointer to source |a|b|1|0| - mov @acc[11], 8*11(%rax) - - ################################# first iteration - mov \$62, $cnt - call __ab_approximation_62 - #mov $f0, 8*7(%rsp) - #mov $g0, 8*8(%rsp) - mov $f1, 8*9(%rsp) - mov $g1, 8*10(%rsp) - - mov \$256, $out_ptr - xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| - call __smulq_383_n_shift_by_62 - #mov $f0, 8*7(%rsp) # corrected |f0| - #mov $g0, 8*8(%rsp) # corrected |g0| - mov $f0, 8*12($out_ptr) # initialize |u| with |f0| - - mov 8*9(%rsp), $f0 # |f1| - mov 8*10(%rsp), $g0 # |g1| - lea 8*6($out_ptr), $out_ptr # pointer to destination |b| - call __smulq_383_n_shift_by_62 - #mov $f0, 8*9(%rsp) # corrected |f1| - #mov $g0, 8*10(%rsp) # corrected |g1| - mov $f0, 8*12($out_ptr) # initialize |v| with |f1| - - ################################# second iteration - xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v| - mov \$62, $cnt - call __ab_approximation_62 - #mov $f0, 8*7(%rsp) - #mov $g0, 8*8(%rsp) - mov $f1, 8*9(%rsp) - mov $g1, 8*10(%rsp) - - mov \$256, $out_ptr - xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| - call __smulq_383_n_shift_by_62 - mov $f0, 8*7(%rsp) # corrected |f0| - mov $g0, 8*8(%rsp) # corrected |g0| - - mov 8*9(%rsp), $f0 # |f1| - mov 8*10(%rsp), $g0 # |g1| - lea 8*6($out_ptr), $out_ptr # pointer to destination |b| - call __smulq_383_n_shift_by_62 - #mov $f0, 8*9(%rsp) # corrected |f1| - #mov $g0, 8*10(%rsp) # corrected |g1| - - mov 8*12($in_ptr), %rax # |u| - mov 8*18($in_ptr), @acc[3] # |v| - mov $f0, %rbx - mov %rax, @acc[2] - imulq 8*7(%rsp) # |u|*|f0| - mov %rax, @acc[0] - mov @acc[3], %rax - mov %rdx, @acc[1] - imulq 8*8(%rsp) # |v|*|g0| - add %rax, @acc[0] - adc %rdx, @acc[1] - mov @acc[0], 8*6($out_ptr) # destination |u| - mov @acc[1], 8*7($out_ptr) - sar \$63, @acc[1] # sign extension - mov @acc[1], 8*8($out_ptr) - mov @acc[1], 8*9($out_ptr) - mov @acc[1], 8*10($out_ptr) - mov @acc[1], 8*11($out_ptr) - lea 8*12($in_ptr),$in_ptr # make in_ptr "rewindable" with xor - - mov @acc[2], %rax - imulq %rbx # |u|*|f1| - mov %rax, @acc[0] - mov @acc[3], %rax - mov %rdx, @acc[1] - imulq %rcx # |v|*|g1| - add %rax, @acc[0] - adc %rdx, @acc[1] - mov @acc[0], 8*12($out_ptr) # destination |v| - mov @acc[1], 8*13($out_ptr) - sar \$63, @acc[1] # sign extension - mov @acc[1], 8*14($out_ptr) - mov @acc[1], 8*15($out_ptr) - mov @acc[1], 8*16($out_ptr) - mov @acc[1], 8*17($out_ptr) -___ -for($i=2; $i<11; $i++) { -my $smul_767x63 = $i>5 ? 
"__smulq_767x63" - : "__smulq_383x63"; -$code.=<<___; - xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| - mov \$62, $cnt - call __ab_approximation_62 - #mov $f0, 8*7(%rsp) - #mov $g0, 8*8(%rsp) - mov $f1, 8*9(%rsp) - mov $g1, 8*10(%rsp) - - mov \$256, $out_ptr - xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| - call __smulq_383_n_shift_by_62 - mov $f0, 8*7(%rsp) # corrected |f0| - mov $g0, 8*8(%rsp) # corrected |g0| - - mov 8*9(%rsp), $f0 # |f1| - mov 8*10(%rsp), $g0 # |g1| - lea 8*6($out_ptr), $out_ptr # pointer to destination |b| - call __smulq_383_n_shift_by_62 - mov $f0, 8*9(%rsp) # corrected |f1| - mov $g0, 8*10(%rsp) # corrected |g1| - - mov 8*7(%rsp), $f0 # |f0| - mov 8*8(%rsp), $g0 # |g0| - lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| - lea 8*6($out_ptr), $out_ptr # pointer to destination |u| - call __smulq_383x63 - - mov 8*9(%rsp), $f0 # |f1| - mov 8*10(%rsp), $g0 # |g1| - lea 8*6($out_ptr),$out_ptr # pointer to destination |v| - call $smul_767x63 -___ -$code.=<<___ if ($i==5); - sar \$63, @acc[5] # sign extension - mov @acc[5], 8*6($out_ptr) - mov @acc[5], 8*7($out_ptr) - mov @acc[5], 8*8($out_ptr) - mov @acc[5], 8*9($out_ptr) - mov @acc[5], 8*10($out_ptr) - mov @acc[5], 8*11($out_ptr) -___ -} -$code.=<<___; - ################################# iteration before last - xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| - mov \$62, $cnt - #call __ab_approximation_62 # |a| and |b| are exact, just load - mov 8*0($in_ptr), @acc[0] # |a_lo| - mov 8*1($in_ptr), @acc[1] # |a_hi| - mov 8*6($in_ptr), @acc[2] # |b_lo| - mov 8*7($in_ptr), @acc[3] # |b_hi| - call __inner_loop_62 - #mov $f0, 8*7(%rsp) - #mov $g0, 8*8(%rsp) - mov $f1, 8*9(%rsp) - mov $g1, 8*10(%rsp) - - mov \$256, $out_ptr - xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| - mov @acc[0], 8*0($out_ptr) - mov @acc[2], 8*6($out_ptr) - - #mov 8*7(%rsp), $f0 # |f0| - #mov 8*8(%rsp), $g0 # |g0| - lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| - lea 8*12($out_ptr),$out_ptr # pointer to destination |u| - call __smulq_383x63 - - mov 8*9(%rsp), $f0 # |f1| - mov 8*10(%rsp), $g0 # |g1| - lea 8*6($out_ptr),$out_ptr # pointer to destination |v| - call __smulq_767x63 - - ################################# last iteration - xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| - mov \$22, $cnt # 766 % 62 - #call __ab_approximation_62 # |a| and |b| are exact, just load - mov 8*0($in_ptr), @acc[0] # |a_lo| - xor @acc[1], @acc[1] # |a_hi| - mov 8*6($in_ptr), @acc[2] # |b_lo| - xor @acc[3], @acc[3] # |b_hi| - call __inner_loop_62 - #mov $f0, 8*7(%rsp) - #mov $g0, 8*8(%rsp) - #mov $f1, 8*9(%rsp) - #mov $g1, 8*10(%rsp) - - #mov 8*7(%rsp), $f0 # |f0| - #mov 8*8(%rsp), $g0 # |g0| - lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| - #lea 8*6($out_ptr), $out_ptr # pointer to destination |u| - #call __smulq_383x63 - - #mov 8*9(%rsp), $f0 # |f1| - #mov 8*10(%rsp), $g0 # |g1| - mov $f1, $f0 - mov $g1, $g0 - mov 8*4(%rsp), $out_ptr # original out_ptr - call __smulq_767x63 - - mov 8*5(%rsp), $in_ptr # original n_ptr - mov %rax, %rdx # top limb of the result - sar \$63, %rax # result's sign as mask - - mov %rax, @acc[0] # mask |modulus| - mov %rax, @acc[1] - mov %rax, @acc[2] - and 8*0($in_ptr), @acc[0] - and 8*1($in_ptr), @acc[1] - mov %rax, @acc[3] - and 8*2($in_ptr), @acc[2] - and 8*3($in_ptr), @acc[3] - mov %rax, @acc[4] - and 8*4($in_ptr), @acc[4] - and 8*5($in_ptr), %rax - - add @acc[0], @acc[6] # conditionally add |modulus|<<384 - adc @acc[1], @acc[7] - adc @acc[2], @acc[8] - adc 
@acc[3], @acc[9] - adc @acc[4], %rcx - adc %rax, %rdx - - mov @acc[6], 8*6($out_ptr) # store absolute value - mov @acc[7], 8*7($out_ptr) - mov @acc[8], 8*8($out_ptr) - mov @acc[9], 8*9($out_ptr) - mov %rcx, 8*10($out_ptr) - mov %rdx, 8*11($out_ptr) - - lea $frame(%rsp), %r8 # size optimization - mov 8*0(%r8),%r15 -.cfi_restore %r15 - mov 8*1(%r8),%r14 -.cfi_restore %r14 - mov 8*2(%r8),%r13 -.cfi_restore %r13 - mov 8*3(%r8),%r12 -.cfi_restore %r12 - mov 8*4(%r8),%rbx -.cfi_restore %rbx - mov 8*5(%r8),%rbp -.cfi_restore %rbp - lea 8*6(%r8),%rsp -.cfi_adjust_cfa_offset -$frame-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size ct_inverse_mod_383,.-ct_inverse_mod_383 -___ -######################################################################## -# see corresponding commentary in ctx_inverse_mod_384-x86_64... -{ -my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx"); -my @acc = map("%r$_",(8..15),"bx","bp","cx","di"); -my $fx = @acc[9]; - -$code.=<<___; -.type __smulq_767x63,\@abi-omnipotent -.align 32 -__smulq_767x63: - mov 8*0($in_ptr), @acc[0] # load |u| - mov 8*1($in_ptr), @acc[1] - mov 8*2($in_ptr), @acc[2] - mov 8*3($in_ptr), @acc[3] - mov 8*4($in_ptr), @acc[4] - mov 8*5($in_ptr), @acc[5] - - mov $f0, $fx - sar \$63, $f0 # |f0|'s sign as mask - xor %rax, %rax - sub $f0, %rax # |f0|'s sign as bit - - mov $out_ptr, 8*1(%rsp) - mov $in_ptr, 8*2(%rsp) - lea 8*6($in_ptr), $in_ptr # pointer to |v| - - xor $f0, $fx # conditionally negate |f0| - add %rax, $fx - - xor $f0, @acc[0] # conditionally negate |u| - xor $f0, @acc[1] - xor $f0, @acc[2] - xor $f0, @acc[3] - xor $f0, @acc[4] - xor $f0, @acc[5] - add @acc[0], %rax - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, @acc[5] - - mulq $fx # |u|*|f0| - mov %rax, 8*0($out_ptr) # offload |u|*|f0| - mov @acc[1], %rax - mov %rdx, @acc[1] -___ -for($i=1; $i<5; $i++) { -$code.=<<___; - mulq $fx - add %rax, @acc[$i] - mov @acc[$i+1], %rax - adc \$0, %rdx - mov %rdx, @acc[$i+1] - mov @acc[$i], 8*$i($out_ptr) -___ -} -$code.=<<___; - imulq $fx - add %rax, @acc[$i] - adc \$0, %rdx - - mov @acc[5], 8*5($out_ptr) - mov %rdx, 8*6($out_ptr) - sar \$63, %rdx # sign extension - mov %rdx, 8*7($out_ptr) -___ -{ -my $fx=$in_ptr; -$code.=<<___; - mov $g0, $f0 # load |g0| - - mov 8*0($in_ptr), @acc[0] # load |v| - mov 8*1($in_ptr), @acc[1] - mov 8*2($in_ptr), @acc[2] - mov 8*3($in_ptr), @acc[3] - mov 8*4($in_ptr), @acc[4] - mov 8*5($in_ptr), @acc[5] - mov 8*6($in_ptr), @acc[6] - mov 8*7($in_ptr), @acc[7] - mov 8*8($in_ptr), @acc[8] - mov 8*9($in_ptr), @acc[9] - mov 8*10($in_ptr), @acc[10] - mov 8*11($in_ptr), @acc[11] - - mov $f0, $fx # overrides in_ptr - sar \$63, $f0 # |g0|'s sign as mask - xor %rax, %rax - sub $f0, %rax # |g0|'s sign as bit - - xor $f0, $fx # conditionally negate |g0| - add %rax, $fx - - xor $f0, @acc[0] # conditionally negate |v| - xor $f0, @acc[1] - xor $f0, @acc[2] - xor $f0, @acc[3] - xor $f0, @acc[4] - xor $f0, @acc[5] - xor $f0, @acc[6] - xor $f0, @acc[7] - xor $f0, @acc[8] - xor $f0, @acc[9] - xor $f0, @acc[10] - xor $f0, @acc[11] - add @acc[0], %rax - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, @acc[5] - adc \$0, @acc[6] - adc \$0, @acc[7] - adc \$0, @acc[8] - adc \$0, @acc[9] - adc \$0, @acc[10] - adc \$0, @acc[11] - - mulq $fx # |v|*|g0| - mov %rax, @acc[0] - mov @acc[1], %rax - mov %rdx, @acc[1] -___ -for($i=1; $i<11; $i++) { -$code.=<<___; - mulq $fx - add %rax, @acc[$i] - mov @acc[$i+1], %rax - adc \$0, %rdx - mov %rdx, @acc[$i+1] -___ -} 
-$code.=<<___; - mov 8*1(%rsp), %rdx # out_ptr - imulq $fx, %rax - mov 8*2(%rsp), $in_ptr # restore original in_ptr - add @acc[11], %rax - - add 8*0(%rdx), @acc[0] # accumulate |u|*|f0| - adc 8*1(%rdx), @acc[1] - adc 8*2(%rdx), @acc[2] - adc 8*3(%rdx), @acc[3] - adc 8*4(%rdx), @acc[4] - adc 8*5(%rdx), @acc[5] - adc 8*6(%rdx), @acc[6] - mov 8*7(%rdx), @acc[11] # sign extension - adc @acc[11], @acc[7] - adc @acc[11], @acc[8] - adc @acc[11], @acc[9] - adc @acc[11], @acc[10] - adc @acc[11], %rax - - mov %rdx, $out_ptr # restore original out_ptr - - mov @acc[0], 8*0(%rdx) - mov @acc[1], 8*1(%rdx) - mov @acc[2], 8*2(%rdx) - mov @acc[3], 8*3(%rdx) - mov @acc[4], 8*4(%rdx) - mov @acc[5], 8*5(%rdx) - mov @acc[6], 8*6(%rdx) - mov @acc[7], 8*7(%rdx) - mov @acc[8], 8*8(%rdx) - mov @acc[9], 8*9(%rdx) - mov @acc[10], 8*10(%rdx) - mov %rax, 8*11(%rdx) - - ret -.size __smulq_767x63,.-__smulq_767x63 -___ -} -$code.=<<___; -.type __smulq_383x63,\@abi-omnipotent -.align 32 -__smulq_383x63: -___ -for($j=0; $j<2; $j++) { -$code.=<<___; - mov 8*0($in_ptr), @acc[0] # load |u| (or |v|) - mov 8*1($in_ptr), @acc[1] - mov 8*2($in_ptr), @acc[2] - mov 8*3($in_ptr), @acc[3] - mov 8*4($in_ptr), @acc[4] - mov 8*5($in_ptr), @acc[5] - - mov %rdx, $fx - sar \$63, %rdx # |f0|'s sign as mask (or |g0|'s) - xor %rax, %rax - sub %rdx, %rax # |f0|'s sign as bit (or |g0|'s) - - xor %rdx, $fx # conditionally negate |f0| - add %rax, $fx - - xor %rdx, @acc[0] # conditionally negate |u| (or |v|) - xor %rdx, @acc[1] - xor %rdx, @acc[2] - xor %rdx, @acc[3] - xor %rdx, @acc[4] - xor %rdx, @acc[5] - add @acc[0], %rax - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, @acc[5] - - mulq $fx # |u|*|f0| (or |v|*|g0|) - mov %rax, @acc[0] - mov @acc[1], %rax - mov %rdx, @acc[1] -___ -for($i=1; $i<5; $i++) { -$code.=<<___; - mulq $fx - add %rax, @acc[$i] - mov @acc[$i+1], %rax - adc \$0, %rdx - mov %rdx, @acc[$i+1] -___ -} -$code.=<<___ if ($j==0); - imulq $fx, %rax - add %rax, @acc[$i] - - lea 8*6($in_ptr), $in_ptr # pointer to |v| - mov $g0, %rdx - - mov @acc[0], 8*0($out_ptr) # offload |u|*|f0| - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov @acc[4], 8*4($out_ptr) - mov @acc[5], 8*5($out_ptr) -___ -} -$code.=<<___; - imulq $fx, %rax - add %rax, @acc[$i] - - lea -8*6($in_ptr), $in_ptr # restore original in_ptr - - add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| - adc 8*1($out_ptr), @acc[1] - adc 8*2($out_ptr), @acc[2] - adc 8*3($out_ptr), @acc[3] - adc 8*4($out_ptr), @acc[4] - adc 8*5($out_ptr), @acc[5] - - mov @acc[0], 8*0($out_ptr) - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov @acc[4], 8*4($out_ptr) - mov @acc[5], 8*5($out_ptr) - - ret -.size __smulq_383x63,.-__smulq_383x63 -___ -{ -$code.=<<___; -.type __smulq_383_n_shift_by_62,\@abi-omnipotent -.align 32 -__smulq_383_n_shift_by_62: - mov $f0, @acc[8] -___ -my $f0 = @acc[8]; -for($j=0; $j<2; $j++) { -$code.=<<___; - mov 8*0($in_ptr), @acc[0] # load |a| (or |b|) - mov 8*1($in_ptr), @acc[1] - mov 8*2($in_ptr), @acc[2] - mov 8*3($in_ptr), @acc[3] - mov 8*4($in_ptr), @acc[4] - mov 8*5($in_ptr), @acc[5] - - mov %rdx, $fx - sar \$63, %rdx # |f0|'s sign as mask (or |g0|'s) - xor %rax, %rax - sub %rdx, %rax # |f0|'s sign as bit (or |g0|'s) - - xor %rdx, $fx # conditionally negate |f0| (or |g0|) - add %rax, $fx - - xor %rdx, @acc[0] # conditionally negate |a| (or |b|) - xor %rdx, @acc[1] - xor %rdx, @acc[2] - xor %rdx, @acc[3] - xor %rdx, @acc[4] - xor %rdx, @acc[5] - add 
@acc[0], %rax - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, @acc[5] - - mulq $fx # |a|*|f0| (or |b|*|g0|) - mov %rax, @acc[0] - mov @acc[1], %rax - mov %rdx, @acc[1] -___ -for($i=1; $i<5; $i++) { -$code.=<<___; - mulq $fx - add %rax, @acc[$i] - mov @acc[$i+1], %rax - adc \$0, %rdx - mov %rdx, @acc[$i+1] -___ -} -$code.=<<___ if ($j==0); - imulq $fx - add %rax, @acc[$i] - adc \$0, %rdx - - lea 8*6($in_ptr), $in_ptr # pointer to |b| - mov %rdx, @acc[6] - mov $g0, %rdx - - mov @acc[0], 8*0($out_ptr) - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov @acc[4], 8*4($out_ptr) - mov @acc[5], 8*5($out_ptr) -___ -} -$code.=<<___; - imulq $fx - add %rax, @acc[$i] - adc \$0, %rdx - - lea -8*6($in_ptr), $in_ptr # restore original in_ptr - - add 8*0($out_ptr), @acc[0] - adc 8*1($out_ptr), @acc[1] - adc 8*2($out_ptr), @acc[2] - adc 8*3($out_ptr), @acc[3] - adc 8*4($out_ptr), @acc[4] - adc 8*5($out_ptr), @acc[5] - adc %rdx, @acc[6] - mov $f0, %rdx - - shrd \$62, @acc[1], @acc[0] - shrd \$62, @acc[2], @acc[1] - shrd \$62, @acc[3], @acc[2] - shrd \$62, @acc[4], @acc[3] - shrd \$62, @acc[5], @acc[4] - shrd \$62, @acc[6], @acc[5] - - sar \$63, @acc[6] # sign as mask - xor $fx, $fx - sub @acc[6], $fx # sign as bit - - xor @acc[6], @acc[0] # conditionally negate the result - xor @acc[6], @acc[1] - xor @acc[6], @acc[2] - xor @acc[6], @acc[3] - xor @acc[6], @acc[4] - xor @acc[6], @acc[5] - add $fx, @acc[0] - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, @acc[5] - - mov @acc[0], 8*0($out_ptr) - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov @acc[4], 8*4($out_ptr) - mov @acc[5], 8*5($out_ptr) - - xor @acc[6], %rdx # conditionally negate |f0| - xor @acc[6], $g0 # conditionally negate |g0| - add $fx, %rdx - add $fx, $g0 - - ret -.size __smulq_383_n_shift_by_62,.-__smulq_383_n_shift_by_62 -___ -} } - -{ -my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11)); -my ($t0, $t1, $t2, $t3, $t4, $t5) = ("%rax","%rbx","%rbp","%r14","%r15","%rsi"); -{ -my @a = ($a_lo, $t1, $a_hi); -my @b = ($b_lo, $t2, $b_hi); - -$code.=<<___; -.type __ab_approximation_62,\@abi-omnipotent -.align 32 -__ab_approximation_62: - mov 8*5($in_ptr), @a[2] # load |a| in reverse order - mov 8*11($in_ptr), @b[2] # load |b| in reverse order - mov 8*4($in_ptr), @a[1] - mov 8*10($in_ptr), @b[1] - mov 8*3($in_ptr), @a[0] - mov 8*9($in_ptr), @b[0] - - mov @a[2], $t0 - or @b[2], $t0 # check top-most limbs, ... - cmovz @a[1], @a[2] - cmovz @b[1], @b[2] - cmovz @a[0], @a[1] - cmovz @b[0], @b[1] - mov 8*2($in_ptr), @a[0] - mov 8*8($in_ptr), @b[0] - - mov @a[2], $t0 - or @b[2], $t0 # ... ones before top-most, ... - cmovz @a[1], @a[2] - cmovz @b[1], @b[2] - cmovz @a[0], @a[1] - cmovz @b[0], @b[1] - mov 8*1($in_ptr), @a[0] - mov 8*7($in_ptr), @b[0] - - mov @a[2], $t0 - or @b[2], $t0 # ... and ones before that ... 
- cmovz @a[1], @a[2] - cmovz @b[1], @b[2] - cmovz @a[0], @a[1] - cmovz @b[0], @b[1] - mov 8*0($in_ptr), @a[0] - mov 8*6($in_ptr), @b[0] - - mov @a[2], $t0 - or @b[2], $t0 - bsr $t0, %rcx - lea 1(%rcx), %rcx - cmovz @a[1], @a[2] - cmovz @b[1], @b[2] - cmovz $t0, %rcx - neg %rcx - #and \$63, %rcx # debugging artefact - - shldq %cl, @a[1], @a[2] # align second limb to the left - shldq %cl, @b[1], @b[2] - - jmp __inner_loop_62 - - ret -.size __ab_approximation_62,.-__ab_approximation_62 -___ -} -$code.=<<___; -.type __inner_loop_62,\@abi-omnipotent -.align 8 -.long 0 -__inner_loop_62: - mov \$1, $f0 # |f0|=1 - xor $g0, $g0 # |g0|=0 - xor $f1, $f1 # |f1|=0 - mov \$1, $g1 # |g1|=1 - mov $in_ptr, 8(%rsp) - -.Loop_62: - xor $t0, $t0 - xor $t1, $t1 - test \$1, $a_lo # if |a_| is odd, then we'll be subtracting |b_| - mov $b_lo, $t2 - mov $b_hi, $t3 - cmovnz $b_lo, $t0 - cmovnz $b_hi, $t1 - sub $a_lo, $t2 # |b_|-|a_| - sbb $a_hi, $t3 - mov $a_lo, $t4 - mov $a_hi, $t5 - sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even) - sbb $t1, $a_hi - cmovc $t2, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_| - cmovc $t3, $a_hi - cmovc $t4, $b_lo # |b_| = |a_| - cmovc $t5, $b_hi - mov $f0, $t0 # exchange |f0| and |f1| - cmovc $f1, $f0 - cmovc $t0, $f1 - mov $g0, $t1 # exchange |g0| and |g1| - cmovc $g1, $g0 - cmovc $t1, $g1 - xor $t0, $t0 - xor $t1, $t1 - shrd \$1, $a_hi, $a_lo - shr \$1, $a_hi - test \$1, $t4 # if |a_| was odd, then we'll be subtracting... - cmovnz $f1, $t0 - cmovnz $g1, $t1 - add $f1, $f1 # |f1|<<=1 - add $g1, $g1 # |g1|<<=1 - sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even) - sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...) - sub \$1, $cnt - jnz .Loop_62 - - mov 8(%rsp), $in_ptr - ret -.size __inner_loop_62,.-__inner_loop_62 -___ -} - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/ctx_inverse_mod_384-x86_64.pl b/crypto/blst_src/asm/ctx_inverse_mod_384-x86_64.pl deleted file mode 100755 index d207e2f5a7c..00000000000 --- a/crypto/blst_src/asm/ctx_inverse_mod_384-x86_64.pl +++ /dev/null @@ -1,995 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Both constant-time and fast Euclidean inversion as suggested in -# https://eprint.iacr.org/2020/972. Performance is >4x better than -# modulus-specific FLT addition chain... 
-# -# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod); -# -$python_ref.=<<'___'; -def ct_inverse_mod_383(inp, mod): - a, u = inp, 1 - b, v = mod, 0 - - k = 31 - mask = (1 << k) - 1 - - for i in range(0, 766 // k): - # __ab_approximation_31 - n = max(a.bit_length(), b.bit_length()) - if n < 64: - a_, b_ = a, b - else: - a_ = (a & mask) | ((a >> (n-k-2)) << k) - b_ = (b & mask) | ((b >> (n-k-2)) << k) - - # __inner_loop_31 - f0, g0, f1, g1 = 1, 0, 0, 1 - for j in range(0, k): - if a_ & 1: - if a_ < b_: - a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 - a_, f0, g0 = a_-b_, f0-f1, g0-g1 - a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 - - # __smulx_383_n_shift_by_31 - a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k - if a < 0: - a, f0, g0 = -a, -f0, -g0 - if b < 0: - b, f1, g1 = -b, -f1, -g1 - - # __smulx_767x63 - u, v = u*f0 + v*g0, u*f1 + v*g1 - - if 766 % k: - f0, g0, f1, g1 = 1, 0, 0, 1 - for j in range(0, 766 % k): - if a & 1: - if a < b: - a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 - a, f0, g0 = a-b, f0-f1, g0-g1 - a, f1, g1 = a >> 1, f1 << 1, g1 << 1 - - v = u*f1 + v*g1 - - if v < 0: - v += mod << (768 - mod.bit_length()) # left aligned - - return v & (2**768 - 1) # to be reduced % mod -___ - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" - or die "can't call $xlate: $!"; - -my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx"); -my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr); -my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13"); -my $cnt = "%edi"; - -$frame = 8*11+2*512; - -$code.=<<___; -.text - -.globl ctx_inverse_mod_383 -.type ctx_inverse_mod_383,\@function,4,"unwind" -.align 32 -ctx_inverse_mod_383: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$$frame, %rsp -.cfi_adjust_cfa_offset $frame -.cfi_end_prologue - - lea 8*11+511(%rsp), %rax # find closest 512-byte-aligned spot - and \$-512, %rax # in the frame... 
- mov $out_ptr, 8*4(%rsp) - mov $nx_ptr, 8*5(%rsp) - - mov 8*0($in_ptr), @acc[0] # load input - mov 8*1($in_ptr), @acc[1] - mov 8*2($in_ptr), @acc[2] - mov 8*3($in_ptr), @acc[3] - mov 8*4($in_ptr), @acc[4] - mov 8*5($in_ptr), @acc[5] - - mov 8*0($n_ptr), @acc[6] # load modulus - mov 8*1($n_ptr), @acc[7] - mov 8*2($n_ptr), @acc[8] - mov 8*3($n_ptr), @acc[9] - mov 8*4($n_ptr), @acc[10] - mov 8*5($n_ptr), @acc[11] - - mov @acc[0], 8*0(%rax) # copy input to |a| - mov @acc[1], 8*1(%rax) - mov @acc[2], 8*2(%rax) - mov @acc[3], 8*3(%rax) - mov @acc[4], 8*4(%rax) - mov @acc[5], 8*5(%rax) - - mov @acc[6], 8*6(%rax) # copy modulus to |b| - mov @acc[7], 8*7(%rax) - mov @acc[8], 8*8(%rax) - mov @acc[9], 8*9(%rax) - mov @acc[10], 8*10(%rax) - mov %rax, $in_ptr - mov @acc[11], 8*11(%rax) - - ################################# first iteration - mov \$31, $cnt - call __ab_approximation_31 - #mov $f0, 8*7(%rsp) - #mov $g0, 8*8(%rsp) - mov $f1, 8*9(%rsp) - mov $g1, 8*10(%rsp) - - mov \$256, $out_ptr - xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| - call __smulx_383_n_shift_by_31 - #mov $f0, 8*7(%rsp) # corrected |f0| - #mov $g0, 8*8(%rsp) # corrected |g0| - mov $f0, 8*12($out_ptr) # initialize |u| with |f0| - - mov 8*9(%rsp), $f0 # |f1| - mov 8*10(%rsp), $g0 # |g1| - lea 8*6($out_ptr), $out_ptr # pointer to destination |b| - call __smulx_383_n_shift_by_31 - #mov $f0, 8*9(%rsp) # corrected |f1| - #mov $g0, 8*10(%rsp) # corrected |g1| - mov $f0, 8*12($out_ptr) # initialize |v| with |f1| - - ################################# second iteration - xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v| - mov \$31, $cnt - call __ab_approximation_31 - #mov $f0, 8*7(%rsp) - #mov $g0, 8*8(%rsp) - mov $f1, 8*9(%rsp) - mov $g1, 8*10(%rsp) - - mov \$256, $out_ptr - xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| - call __smulx_383_n_shift_by_31 - mov $f0, 8*7(%rsp) # corrected |f0| - mov $g0, 8*8(%rsp) # corrected |g0| - - mov 8*9(%rsp), $f0 # |f1| - mov 8*10(%rsp), $g0 # |g1| - lea 8*6($out_ptr), $out_ptr # pointer to destination |b| - call __smulx_383_n_shift_by_31 - #mov $f0, 8*9(%rsp) # corrected |f1| - #mov $g0, 8*10(%rsp) # corrected |g1| - - mov 8*12($in_ptr), %rax # |u| - mov 8*18($in_ptr), @acc[3] # |v| - mov $f0, %rbx - mov %rax, @acc[2] - imulq 8*7(%rsp) # |u|*|f0| - mov %rax, @acc[0] - mov @acc[3], %rax - mov %rdx, @acc[1] - imulq 8*8(%rsp) # |v|*|g0| - add %rax, @acc[0] - adc %rdx, @acc[1] - mov @acc[0], 8*6($out_ptr) # destination |u| - mov @acc[1], 8*7($out_ptr) - sar \$63, @acc[1] # sign extension - mov @acc[1], 8*8($out_ptr) - mov @acc[1], 8*9($out_ptr) - mov @acc[1], 8*10($out_ptr) - mov @acc[1], 8*11($out_ptr) - lea 8*12($in_ptr), $in_ptr # make in_ptr "rewindable" with xor - - mov @acc[2], %rax - imulq %rbx # |u|*|f1| - mov %rax, @acc[0] - mov @acc[3], %rax - mov %rdx, @acc[1] - imulq %rcx # |v|*|g1| - add %rax, @acc[0] - adc %rdx, @acc[1] - mov @acc[0], 8*12($out_ptr) # destination |v| - mov @acc[1], 8*13($out_ptr) - sar \$63, @acc[1] # sign extension - mov @acc[1], 8*14($out_ptr) - mov @acc[1], 8*15($out_ptr) - mov @acc[1], 8*16($out_ptr) - mov @acc[1], 8*17($out_ptr) -___ -for($i=2; $i<23; $i++) { -my $smul_n_shift = $i<19 ? "__smulx_383_n_shift_by_31" - : "__smulx_191_n_shift_by_31"; -my $smul_767x63 = $i>11 ? 
"__smulx_767x63" - : "__smulx_383x63"; -$code.=<<___; - xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| - mov \$31, $cnt - call __ab_approximation_31 - #mov $f0, 8*7(%rsp) - #mov $g0, 8*8(%rsp) - mov $f1, 8*9(%rsp) - mov $g1, 8*10(%rsp) - - mov \$256, $out_ptr - xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| - call $smul_n_shift - mov $f0, 8*7(%rsp) # corrected |f0| - mov $g0, 8*8(%rsp) # corrected |g0| - - mov 8*9(%rsp), $f0 # |f1| - mov 8*10(%rsp), $g0 # |g1| - lea 8*6($out_ptr), $out_ptr # pointer to destination |b| - call $smul_n_shift - mov $f0, 8*9(%rsp) # corrected |f1| - mov $g0, 8*10(%rsp) # corrected |g1| - - mov 8*7(%rsp), $f0 # |f0| - mov 8*8(%rsp), $g0 # |g0| - lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| - lea 8*6($out_ptr), $out_ptr # pointer to destination |u| - call __smulx_383x63 - - mov 8*9(%rsp), $f0 # |f1| - mov 8*10(%rsp), $g0 # |g1| - lea 8*6($out_ptr),$out_ptr # pointer to destination |v| - call $smul_767x63 -___ -$code.=<<___ if ($i==11); - sar \$63, @acc[5] # sign extension - mov @acc[5], 8*6($out_ptr) - mov @acc[5], 8*7($out_ptr) - mov @acc[5], 8*8($out_ptr) - mov @acc[5], 8*9($out_ptr) - mov @acc[5], 8*10($out_ptr) - mov @acc[5], 8*11($out_ptr) -___ -} -$code.=<<___; - ################################# two[!] last iterations in one go - xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| - mov \$53, $cnt # 31 + 766 % 31 - #call __ab_approximation_31 # |a| and |b| are exact, just load - mov 8*0($in_ptr), @acc[0] # |a_lo| - #xor @acc[1], @acc[1] # |a_hi| - mov 8*6($in_ptr), @acc[2] # |b_lo| - #xor @acc[3], @acc[3] # |b_hi| - call __inner_loop_62 - #mov $f0, 8*7(%rsp) - #mov $g0, 8*8(%rsp) - #mov $f1, 8*9(%rsp) - #mov $g1, 8*10(%rsp) - - #mov 8*7(%rsp), $f0 # |f0| - #mov 8*8(%rsp), $g0 # |g0| - lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| - #lea 8*6($out_ptr), $out_ptr # pointer to destination |u| - #call __smulx_383x63 - - #mov 8*9(%rsp), $f0 # |f1| - #mov 8*10(%rsp), $g0 # |g1| - mov $f1, $f0 - mov $g1, $g0 - mov 8*4(%rsp), $out_ptr # original out_ptr - call __smulx_767x63 - - mov 8*5(%rsp), $in_ptr # original n_ptr - mov %rax, %rdx # top limb of the result - sar \$63, %rax # result's sign as mask - - mov %rax, @acc[0] # mask |modulus| - mov %rax, @acc[1] - mov %rax, @acc[2] - and 8*0($in_ptr), @acc[0] - and 8*1($in_ptr), @acc[1] - mov %rax, @acc[3] - and 8*2($in_ptr), @acc[2] - and 8*3($in_ptr), @acc[3] - mov %rax, @acc[4] - and 8*4($in_ptr), @acc[4] - and 8*5($in_ptr), %rax - - add @acc[0], @acc[6] # conditionally add |modulus|<<384 - adc @acc[1], @acc[7] - adc @acc[2], @acc[8] - adc @acc[3], @acc[9] - adc @acc[4], %rcx - adc %rax, %rdx - - mov @acc[6], 8*6($out_ptr) # store absolute value - mov @acc[7], 8*7($out_ptr) - mov @acc[8], 8*8($out_ptr) - mov @acc[9], 8*9($out_ptr) - mov %rcx, 8*10($out_ptr) - mov %rdx, 8*11($out_ptr) - - lea $frame(%rsp), %r8 # size optimization - mov 8*0(%r8),%r15 -.cfi_restore %r15 - mov 8*1(%r8),%r14 -.cfi_restore %r14 - mov 8*2(%r8),%r13 -.cfi_restore %r13 - mov 8*3(%r8),%r12 -.cfi_restore %r12 - mov 8*4(%r8),%rbx -.cfi_restore %rbx - mov 8*5(%r8),%rbp -.cfi_restore %rbp - lea 8*6(%r8),%rsp -.cfi_adjust_cfa_offset -$frame-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size ctx_inverse_mod_383,.-ctx_inverse_mod_383 -___ -######################################################################## -# Signed |u|*|f?|+|v|*|g?| subroutines. 
"NNN" in "NNNx63" suffix refers -# to the maximum bit-length of the *result*, and "63" - to the maximum -# bit-length of the |f?| and |g?| single-limb multiplicands. However! -# The latter should not be taken literally, as they are always chosen so -# that "bad things" don't happen. For example, there comes a point when -# |v| grows beyond 383 bits, while |u| remains 383 bits wide. Yet, we -# always call __smul_383x63 to perform |u|*|f0|+|v|*|g0| step. This is -# because past that point |f0| is always 1 and |g0| is always 0. And, -# since |u| never grows beyond 383 bits, __smul_767x63 doesn't have to -# perform full-width |u|*|f1| multiplication, half-width one with sign -# extension is sufficient... -{ -my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx"); -my @acc = map("%r$_",(8..15),"bx","bp","cx","di"); -my $fx = @acc[9]; - -$code.=<<___; -.type __smulx_767x63,\@abi-omnipotent -.align 32 -__smulx_767x63: - mov 8*0($in_ptr), @acc[0] # load |u| - mov 8*1($in_ptr), @acc[1] - mov 8*2($in_ptr), @acc[2] - mov 8*3($in_ptr), @acc[3] - mov 8*4($in_ptr), @acc[4] - mov 8*5($in_ptr), @acc[5] - - mov $f0, %rax - sar \$63, %rax # |f0|'s sign as mask - xor $fx, $fx # overrides in_ptr - sub %rax, $fx # |f0|'s sign as bit - - mov $out_ptr, 8*1(%rsp) - mov $in_ptr, 8*2(%rsp) - lea 8*6($in_ptr), $in_ptr # pointer to |v| - - xor %rax, $f0 # conditionally negate |f0| - add $fx, $f0 - - xor %rax, @acc[0] # conditionally negate |u| - xor %rax, @acc[1] - xor %rax, @acc[2] - xor %rax, @acc[3] - xor %rax, @acc[4] - xor @acc[5], %rax - add $fx, @acc[0] - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, %rax - - mulx @acc[0], @acc[0], $fx # |u|*|f0| - mulx @acc[1], @acc[1], @acc[5] - add $fx, @acc[1] -___ -for(my ($a,$b) = ($fx, @acc[5]), $i=2; $i<5; $i++) { -$code.=<<___; - mulx @acc[$i], @acc[$i], $a - adc $b, @acc[$i] -___ - ($a, $b) = ($b, $a); -} -$code.=<<___; - adc \$0, $fx - imulq %rdx - add $fx, %rax - adc \$0, %rdx - - mov @acc[0], 8*0($out_ptr) # offload |u|*|f0| - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov @acc[4], 8*4($out_ptr) - mov %rax, 8*5($out_ptr) - mov %rdx, 8*6($out_ptr) - sar \$63, %rdx # sign extension - mov %rdx, 8*7($out_ptr) -___ -{ -my $fx=$in_ptr; -$code.=<<___; - mov $g0, $f0 # load |g0| - mov $g0, %rax - - mov 8*0($in_ptr), @acc[0] # load |v| - mov 8*1($in_ptr), @acc[1] - mov 8*2($in_ptr), @acc[2] - mov 8*3($in_ptr), @acc[3] - mov 8*4($in_ptr), @acc[4] - mov 8*5($in_ptr), @acc[5] - mov 8*6($in_ptr), @acc[6] - mov 8*7($in_ptr), @acc[7] - mov 8*8($in_ptr), @acc[8] - mov 8*9($in_ptr), @acc[9] - mov 8*10($in_ptr), @acc[10] - mov 8*11($in_ptr), @acc[11] - - sar \$63, %rax # |g0|'s sign as mask - xor $fx, $fx # overrides in_ptr - sub %rax, $fx # |g0|'s sign as bit - - xor %rax, $f0 # conditionally negate |g0| - add $fx, $f0 - - xor %rax, @acc[0] # conditionally negate |v| - xor %rax, @acc[1] - xor %rax, @acc[2] - xor %rax, @acc[3] - xor %rax, @acc[4] - xor %rax, @acc[5] - xor %rax, @acc[6] - xor %rax, @acc[7] - xor %rax, @acc[8] - xor %rax, @acc[9] - xor %rax, @acc[10] - xor %rax, @acc[11] - add $fx, @acc[0] - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, @acc[5] - adc \$0, @acc[6] - adc \$0, @acc[7] - adc \$0, @acc[8] - adc \$0, @acc[9] - adc \$0, @acc[10] - adc \$0, @acc[11] - - mulx @acc[0], @acc[0], %rax # |v|*|g0| - mulx @acc[1], @acc[1], $fx - add %rax, @acc[1] -___ -for(my ($a,$b) = ("%rax", $fx), $i=2; $i<11; $i++) { -$code.=<<___; - mulx 
@acc[$i], @acc[$i], $a - adc $b, @acc[$i] -___ - ($a, $b) = ($b, $a); -} -$code.=<<___; - mulx @acc[11], @acc[11], $fx - mov 8*1(%rsp), %rdx # out_ptr - mov 8*2(%rsp), $in_ptr # restore original in_ptr - adc @acc[11], %rax - - add 8*0(%rdx), @acc[0] # accumulate |u|*|f0| - adc 8*1(%rdx), @acc[1] - adc 8*2(%rdx), @acc[2] - adc 8*3(%rdx), @acc[3] - adc 8*4(%rdx), @acc[4] - adc 8*5(%rdx), @acc[5] - adc 8*6(%rdx), @acc[6] - mov 8*7(%rdx), @acc[11] # sign extension - adc @acc[11], @acc[7] - adc @acc[11], @acc[8] - adc @acc[11], @acc[9] - adc @acc[11], @acc[10] - adc @acc[11], %rax - - mov %rdx, $out_ptr # restore original out_ptr - - mov @acc[0], 8*0(%rdx) - mov @acc[1], 8*1(%rdx) - mov @acc[2], 8*2(%rdx) - mov @acc[3], 8*3(%rdx) - mov @acc[4], 8*4(%rdx) - mov @acc[5], 8*5(%rdx) - mov @acc[6], 8*6(%rdx) - mov @acc[7], 8*7(%rdx) - mov @acc[8], 8*8(%rdx) - mov @acc[9], 8*9(%rdx) - mov @acc[10], 8*10(%rdx) - mov %rax, 8*11(%rdx) - - ret -.size __smulx_767x63,.-__smulx_767x63 -___ -} -$code.=<<___; -.type __smulx_383x63,\@abi-omnipotent -.align 32 -__smulx_383x63: -___ -for($j=0; $j<2; $j++) { -my $k = 8*6*$j; -$code.=<<___; - mov $k+8*0($in_ptr), @acc[0] # load |u| (or |v|) - mov $k+8*1($in_ptr), @acc[1] - mov $k+8*2($in_ptr), @acc[2] - mov $k+8*3($in_ptr), @acc[3] - mov $k+8*4($in_ptr), @acc[4] - mov $k+8*5($in_ptr), @acc[5] - - mov $f0, $fx - sar \$63, $fx # |f0|'s sign as mask (or |g0|'s) - xor %rax, %rax - sub $fx, %rax # |f0|'s sign as bit (or |g0|'s) - - xor $fx, $f0 # conditionally negate |f0| - add %rax, $f0 - - xor $fx, @acc[0] # conditionally negate |u| (or |v|) - xor $fx, @acc[1] - xor $fx, @acc[2] - xor $fx, @acc[3] - xor $fx, @acc[4] - xor $fx, @acc[5] - add %rax, @acc[0] - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, @acc[5] - - mulx @acc[0], @acc[0], $fx # |u|*|f0| (or |v|*|g0|) - mulx @acc[1], @acc[1], %rax - add $fx, @acc[1] -___ -for(my ($a,$b) = ($fx, "%rax"), $i=2; $i<5; $i++) { -$code.=<<___; - mulx @acc[$i], @acc[$i], $a - adc $b, @acc[$i] -___ - ($a, $b) = ($b, $a); -} -$code.=<<___ if ($j==0); - mulx @acc[$i], @acc[$i], %rax - mov $g0, $f0 - adc $fx, @acc[$i] - - mov @acc[0], 8*0($out_ptr) # offload |u|*|f0| - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov @acc[4], 8*4($out_ptr) - mov @acc[5], 8*5($out_ptr) -___ -} -$code.=<<___; - mulx @acc[$i], @acc[$i], %rax - adc $fx, @acc[$i] - - add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| - adc 8*1($out_ptr), @acc[1] - adc 8*2($out_ptr), @acc[2] - adc 8*3($out_ptr), @acc[3] - adc 8*4($out_ptr), @acc[4] - adc 8*5($out_ptr), @acc[5] - - mov @acc[0], 8*0($out_ptr) - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov @acc[4], 8*4($out_ptr) - mov @acc[5], 8*5($out_ptr) - - ret -.size __smulx_383x63,.-__smulx_383x63 -___ -######################################################################## -# Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of -# the names refers to maximum bit-lengths of |a| and |b|. As already -# mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always -# chosen so that "bad things" don't happen. For example, so that the -# sum of the products doesn't overflow, and that the final result is -# never wider than inputs... 
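For orientation, the effect of these shift-by-31 helpers can be modelled on a single limb in C. The sketch below is illustrative only: smul_n_shift_by_31_ref is a hypothetical name, gcc/clang's __int128 with arithmetic right shift is assumed, and a plain branch stands in for the branchless mask-and-negate sequence used by the assembly that follows.

#include <stdint.h>

typedef uint64_t limb_t;

/* Single-limb model of abs(|a|*f + |b|*g) >> 31: the callers choose f and g
 * so that the signed sum fits (here: in 128 bits), and when the sum is
 * negative both the result and the factors are negated, so that f and g
 * stay consistent with the absolute value that is stored back. */
static limb_t smul_n_shift_by_31_ref(limb_t a, limb_t b,
                                     int64_t *f, int64_t *g)
{
    __int128 t = (__int128)a * *f + (__int128)b * *g;

    t >>= 31;              /* arithmetic shift; low bits are zero in use */

    if (t < 0) {           /* the assembly does this with masks          */
        t = -t;
        *f = -*f;
        *g = -*g;
    }

    return (limb_t)t;      /* model truncates; real helpers are multi-limb */
}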
-{ -$code.=<<___; -.type __smulx_383_n_shift_by_31,\@abi-omnipotent -.align 32 -__smulx_383_n_shift_by_31: - mov $f0, @acc[8] - xor @acc[6], @acc[6] -___ -my $f0 = @acc[8]; -for($j=0; $j<2; $j++) { -my $k = 8*6*$j; -$code.=<<___; - mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|) - mov $k+8*1($in_ptr), @acc[1] - mov $k+8*2($in_ptr), @acc[2] - mov $k+8*3($in_ptr), @acc[3] - mov $k+8*4($in_ptr), @acc[4] - mov $k+8*5($in_ptr), @acc[5] - - mov %rdx, %rax - sar \$63, %rax # |f0|'s sign as mask (or |g0|'s) - xor $fx, $fx - sub %rax, $fx # |f0|'s sign as bit (or |g0|'s) - - xor %rax, %rdx # conditionally negate |f0| (or |g0|) - add $fx, %rdx - - xor %rax, @acc[0] # conditionally negate |a| (or |b|) - xor %rax, @acc[1] - xor %rax, @acc[2] - xor %rax, @acc[3] - xor %rax, @acc[4] - xor @acc[5], %rax - add $fx, @acc[0] - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, %rax - - mulx @acc[0], @acc[0], $fx # |a|*|f0| (or |b|*|g0|) - mulx @acc[1], @acc[1], @acc[5] - add $fx, @acc[1] -___ -for(my ($a,$b) = ($fx, @acc[5]), $i=2; $i<5; $i++) { -$code.=<<___; - mulx @acc[$i], @acc[$i], $a - adc $b, @acc[$i] -___ - ($a, $b) = ($b, $a); -} -$code.=<<___ if ($j==0); - adc \$0, $fx - imulq %rdx - add $fx, %rax - adc %rdx, @acc[6] - - mov $g0, %rdx - - mov @acc[0], 8*0($out_ptr) - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov @acc[4], 8*4($out_ptr) - mov %rax, 8*5($out_ptr) -___ -} -$code.=<<___; - adc \$0, $fx - imulq %rdx - add $fx, %rax - adc \$0, %rdx - - add 8*0($out_ptr), @acc[0] - adc 8*1($out_ptr), @acc[1] - adc 8*2($out_ptr), @acc[2] - adc 8*3($out_ptr), @acc[3] - adc 8*4($out_ptr), @acc[4] - adc 8*5($out_ptr), %rax - adc %rdx, @acc[6] - mov $f0, %rdx - - shrd \$31, @acc[1], @acc[0] - shrd \$31, @acc[2], @acc[1] - shrd \$31, @acc[3], @acc[2] - shrd \$31, @acc[4], @acc[3] - shrd \$31, %rax, @acc[4] - shrd \$31, @acc[6], %rax - - sar \$63, @acc[6] # sign as mask - xor $fx, $fx - sub @acc[6], $fx # sign as bit - - xor @acc[6], @acc[0] # conditionally negate the result - xor @acc[6], @acc[1] - xor @acc[6], @acc[2] - xor @acc[6], @acc[3] - xor @acc[6], @acc[4] - xor @acc[6], %rax - add $fx, @acc[0] - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, %rax - - mov @acc[0], 8*0($out_ptr) - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov @acc[4], 8*4($out_ptr) - mov %rax, 8*5($out_ptr) - - xor @acc[6], %rdx # conditionally negate |f0| - xor @acc[6], $g0 # conditionally negate |g0| - add $fx, %rdx - add $fx, $g0 - - ret -.size __smulx_383_n_shift_by_31,.-__smulx_383_n_shift_by_31 -___ -} { -$code.=<<___; -.type __smulx_191_n_shift_by_31,\@abi-omnipotent -.align 32 -__smulx_191_n_shift_by_31: - mov $f0, @acc[8] -___ -my $f0 = @acc[8]; -for($j=0; $j<2; $j++) { -my $k = 8*6*$j; -my @acc=@acc; - @acc=@acc[3..5] if ($j); -$code.=<<___; - mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|) - mov $k+8*1($in_ptr), @acc[1] - mov $k+8*2($in_ptr), @acc[2] - - mov %rdx, %rax - sar \$63, %rax # |f0|'s sign as mask (or |g0|'s) - xor $fx, $fx - sub %rax, $fx # |f0|'s sign as bit (or |g0|'s) - - xor %rax, %rdx # conditionally negate |f0| (or |g0|) - add $fx, %rdx - - xor %rax, @acc[0] # conditionally negate |a| (or |b|) - xor %rax, @acc[1] - xor @acc[2], %rax - add $fx, @acc[0] - adc \$0, @acc[1] - adc \$0, %rax - - mulx @acc[0], @acc[0], $fx # |a|*|f0| (or |b|*|g0|) - mulx @acc[1], @acc[1], @acc[2] - add $fx, @acc[1] - adc \$0, @acc[2] - imulq %rdx - add %rax, @acc[2] - adc 
\$0, %rdx -___ -$code.=<<___ if ($j==0); - mov %rdx, @acc[6] - mov $g0, %rdx -___ -} -$code.=<<___; - add @acc[0], @acc[3] - adc @acc[1], @acc[4] - adc @acc[2], @acc[5] - adc %rdx, @acc[6] - mov $f0, %rdx - - shrd \$31, @acc[4], @acc[3] - shrd \$31, @acc[5], @acc[4] - shrd \$31, @acc[6], @acc[5] - - sar \$63, @acc[6] # sign as mask - xor $fx, $fx - sub @acc[6], $fx # sign as bit - - xor @acc[6], @acc[3] # conditionally negate the result - xor @acc[6], @acc[4] - xor @acc[6], @acc[5] - add $fx, @acc[3] - adc \$0, @acc[4] - adc \$0, @acc[5] - - mov @acc[3], 8*0($out_ptr) - mov @acc[4], 8*1($out_ptr) - mov @acc[5], 8*2($out_ptr) - - xor @acc[6], %rdx # conditionally negate |f0| - xor @acc[6], $g0 # conditionally negate |g0| - add $fx, %rdx - add $fx, $g0 - - ret -.size __smulx_191_n_shift_by_31,.-__smulx_191_n_shift_by_31 -___ -} } - -{ -my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11)); -my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15"); -my ($fg0, $fg1, $bias) = ($g0, $g1, $t4); -my ($a_, $b_) = ($a_lo, $b_lo); -{ -my @a = ($a_lo, $t1, $a_hi); -my @b = ($b_lo, $t2, $b_hi); - -$code.=<<___; -.type __ab_approximation_31,\@abi-omnipotent -.align 32 -__ab_approximation_31: - mov 8*5($in_ptr), @a[2] # load |a| in reverse order - mov 8*11($in_ptr), @b[2] # load |b| in reverse order - mov 8*4($in_ptr), @a[1] - mov 8*10($in_ptr), @b[1] - mov 8*3($in_ptr), @a[0] - mov 8*9($in_ptr), @b[0] - - mov @a[2], $t0 - or @b[2], $t0 # check top-most limbs, ... - cmovz @a[1], @a[2] - cmovz @b[1], @b[2] - cmovz @a[0], @a[1] - mov 8*2($in_ptr), @a[0] - cmovz @b[0], @b[1] - mov 8*8($in_ptr), @b[0] - - mov @a[2], $t0 - or @b[2], $t0 # ... ones before top-most, ... - cmovz @a[1], @a[2] - cmovz @b[1], @b[2] - cmovz @a[0], @a[1] - mov 8*1($in_ptr), @a[0] - cmovz @b[0], @b[1] - mov 8*7($in_ptr), @b[0] - - mov @a[2], $t0 - or @b[2], $t0 # ... and ones before that ... - cmovz @a[1], @a[2] - cmovz @b[1], @b[2] - cmovz @a[0], @a[1] - mov 8*0($in_ptr), @a[0] - cmovz @b[0], @b[1] - mov 8*6($in_ptr), @b[0] - - mov @a[2], $t0 - or @b[2], $t0 # ... and ones before that ... 
- cmovz @a[1], @a[2] - cmovz @b[1], @b[2] - cmovz @a[0], @a[1] - cmovz @b[0], @b[1] - - mov @a[2], $t0 - or @b[2], $t0 - bsr $t0, %rcx - lea 1(%rcx), %rcx - cmovz @a[0], @a[2] - cmovz @b[0], @b[2] - cmovz $t0, %rcx - neg %rcx - #and \$63, %rcx # debugging artefact - - shldq %cl, @a[1], @a[2] # align second limb to the left - shldq %cl, @b[1], @b[2] - - mov \$0x7FFFFFFF, %eax - and %rax, @a[0] - and %rax, @b[0] - andn @a[2], %rax, @a[2] - andn @b[2], %rax, @b[2] - or @a[2], @a[0] - or @b[2], @b[0] - - jmp __inner_loop_31 - - ret -.size __ab_approximation_31,.-__ab_approximation_31 -___ -} -$code.=<<___; -.type __inner_loop_31,\@abi-omnipotent -.align 32 -__inner_loop_31: ################# by Thomas Pornin - mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0 - mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1 - mov \$0x7FFFFFFF7FFFFFFF, $bias - -.Loop_31: - cmp $b_, $a_ # if |a_|<|b_|, swap the variables - mov $a_, $t0 - mov $b_, $t1 - mov $fg0, $t2 - mov $fg1, $t3 - cmovb $b_, $a_ - cmovb $t0, $b_ - cmovb $fg1, $fg0 - cmovb $t2, $fg1 - - sub $b_, $a_ # |a_|-|b_| - sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1| - add $bias, $fg0 - - test \$1, $t0 # if |a_| was even, roll back - cmovz $t0, $a_ - cmovz $t1, $b_ - cmovz $t2, $fg0 - cmovz $t3, $fg1 - - shr \$1, $a_ # |a_|>>=1 - add $fg1, $fg1 # |f1|<<=1, |g1|<<=1 - sub $bias, $fg1 - sub \$1, $cnt - jnz .Loop_31 - - shr \$32, $bias - mov %ecx, %edx # $fg0, $f0 - mov ${fg1}d, ${f1}d - shr \$32, $g0 - shr \$32, $g1 - sub $bias, $f0 # remove the bias - sub $bias, $g0 - sub $bias, $f1 - sub $bias, $g1 - - ret -.size __inner_loop_31,.-__inner_loop_31 - -.type __inner_loop_62,\@abi-omnipotent -.align 32 -__inner_loop_62: - mov \$1, $f0 # |f0|=1 - xor $g0, $g0 # |g0|=0 - xor $f1, $f1 # |f1|=0 - mov \$1, $g1 # |g1|=1 - -.Loop_62: - xor $t0, $t0 - test \$1, $a_lo # if |a_| is odd, then we'll be subtracting |b_| - mov $b_lo, $t1 - cmovnz $b_lo, $t0 - sub $a_lo, $t1 # |b_|-|a_| - mov $a_lo, $t2 - sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even) - cmovc $t1, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_| - cmovc $t2, $b_lo # |b_| = |a_| - mov $f0, $t0 # exchange |f0| and |f1| - cmovc $f1, $f0 - cmovc $t0, $f1 - mov $g0, $t1 # exchange |g0| and |g1| - cmovc $g1, $g0 - cmovc $t1, $g1 - xor $t0, $t0 - xor $t1, $t1 - shr \$1, $a_lo - test \$1, $t2 # if |a_| was odd, then we'll be subtracting... - cmovnz $f1, $t0 - cmovnz $g1, $t1 - add $f1, $f1 # |f1|<<=1 - add $g1, $g1 # |g1|<<=1 - sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even) - sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...) - sub \$1, $cnt - jnz .Loop_62 - - ret -.size __inner_loop_62,.-__inner_loop_62 -___ -} - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/div3w-armv8.pl b/crypto/blst_src/asm/div3w-armv8.pl deleted file mode 100755 index bfa32453c3a..00000000000 --- a/crypto/blst_src/asm/div3w-armv8.pl +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
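The two inner loops above are the core of the binary extended GCD: a fixed number of steps is run on single-limb values while the update factors f0, g0, f1, g1 are accumulated (__inner_loop_31 additionally packs each f/g pair into one biased register). A rough C model of the __inner_loop_62 variant is sketched below; inner_loop_62_ref is a hypothetical name, and plain branches replace the cmov-based, branch-free flow of the assembly.

#include <stdint.h>

typedef uint64_t limb_t;

/* One batch of n GCD steps on single-limb |a| and |b| (exact in the last
 * two iterations, where the full values fit in one limb).  On return the
 * factors satisfy  A*f0 + B*g0 == a<<n  and  A*f1 + B*g1 == b<<n  for the
 * starting values A and B, which is how the caller updates |a|,|b| and
 * |u|,|v| with the __smulx_* helpers. */
static void inner_loop_62_ref(limb_t a, limb_t b, unsigned n, int64_t fg[4])
{
    int64_t f0 = 1, g0 = 0, f1 = 0, g1 = 1, t;

    while (n--) {
        if (a & 1) {                     /* odd: subtract |b| from |a|    */
            if (a < b) {                 /* swap so the difference is >=0 */
                limb_t s = a; a = b; b = s;
                t = f0; f0 = f1; f1 = t;
                t = g0; g0 = g1; g1 = t;
            }
            a -= b;
            f0 -= f1;
            g0 -= g1;
        }
        a >>= 1;                         /* a is even here, halving exact */
        f1 <<= 1;                        /* |f1|<<=1, |g1|<<=1 as in asm  */
        g1 <<= 1;
    }

    fg[0] = f0; fg[1] = g0; fg[2] = f1; fg[3] = g1;
}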
-# SPDX-License-Identifier: Apache-2.0 - -$flavour = shift; -$output = shift; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -$code.=<<___; -.text - -.globl div_3_limbs -.type div_3_limbs,%function -.align 5 -div_3_limbs: - ldp x4,x5,[x0] // load R - eor x0,x0,x0 // Q = 0 - mov x3,#64 // loop counter - nop - -.Loop: - subs x6,x4,x1 // R - D - add x0,x0,x0 // Q <<= 1 - sbcs x7,x5,x2 - add x0,x0,#1 // Q + speculative bit - csel x4,x4,x6,lo // select between R and R - D - extr x1,x2,x1,#1 // D >>= 1 - csel x5,x5,x7,lo - lsr x2,x2,#1 - sbc x0,x0,xzr // subtract speculative bit - sub x3,x3,#1 - cbnz x3,.Loop - - asr x3,x0,#63 // top bit -> mask - add x0,x0,x0 // Q <<= 1 - subs x6,x4,x1 // R - D - add x0,x0,#1 // Q + specilative bit - sbcs x7,x5,x2 - sbc x0,x0,xzr // subtract speculative bit - - orr x0,x0,x3 // all ones if overflow - - ret -.size div_3_limbs,.-div_3_limbs -___ -{ -my ($div_rem, $divisor, $quot) = map("x$_",(0..2)); -my @div = map("x$_",(3..4)); -my @acc = map("x$_",(5..7)); -my @t = map("x$_",(8..11)); - -$code.=<<___; -.globl quot_rem_128 -.type quot_rem_128,%function -.align 5 -quot_rem_128: - ldp @div[0],@div[1],[$divisor] - - mul @acc[0],@div[0],$quot // divisor[0:1} * quotient - umulh @acc[1],@div[0],$quot - mul @t[3], @div[1],$quot - umulh @acc[2],@div[1],$quot - - ldp @t[0],@t[1],[$div_rem] // load 3 limbs of the dividend - ldr @t[2],[$div_rem,#16] - - adds @acc[1],@acc[1],@t[3] - adc @acc[2],@acc[2],xzr - - subs @t[0],@t[0],@acc[0] // dividend - divisor * quotient - sbcs @t[1],@t[1],@acc[1] - sbcs @t[2],@t[2],@acc[2] - sbc @acc[0],xzr,xzr // borrow -> mask - - add $quot,$quot,@acc[0] // if borrowed, adjust the quotient ... - and @div[0],@div[0],@acc[0] - and @div[1],@div[1],@acc[0] - adds @t[0],@t[0],@div[0] // ... and add divisor - adc @t[1],@t[1],@div[1] - - stp @t[0],@t[1],[$div_rem] // save 2 limbs of the remainder - str $quot,[$div_rem,#16] // and one limb of the quotient - - mov x0,$quot // return adjusted quotient - - ret -.size quot_rem_128,.-quot_rem_128 - -.globl quot_rem_64 -.type quot_rem_64,%function -.align 5 -quot_rem_64: - ldr @div[0],[$divisor] - ldr @t[0],[$div_rem] // load 1 limb of the dividend - - mul @acc[0],@div[0],$quot // divisor * quotient - - sub @t[0],@t[0],@acc[0] // dividend - divisor * quotient - - stp @t[0],$quot,[$div_rem] // save remainder and quotient - - mov x0,$quot // return quotient - - ret -.size quot_rem_64,.-quot_rem_64 -___ -} - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/div3w-x86_64.pl b/crypto/blst_src/asm/div3w-x86_64.pl deleted file mode 100755 index b8192db8e6d..00000000000 --- a/crypto/blst_src/asm/div3w-x86_64.pl +++ /dev/null @@ -1,184 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-# SPDX-License-Identifier: Apache-2.0 - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" - or die "can't call $xlate: $!"; - -$c_ref=<<'___'; -/* - * |div_top| points at two most significant limbs of the dividend, |d_hi| - * and |d_lo| are two most significant limbs of the divisor. If divisor - * is only one limb, it is to be passed in |d_hi| with zero in |d_lo|. - * The divisor is required to be "bitwise left-aligned," and dividend's - * top limbs to be not larger than the divisor's. The latter limitation - * can be problematic in the first iteration of multi-precision division, - * where in most general case the condition would have to be "smaller." - * The subroutine considers four limbs, two of which are "overlapping," - * hence the name... Another way to look at it is to think of the pair - * of the dividend's limbs being suffixed with a zero: - * +-------+-------+-------+ - * R | | | 0 | - * +-------+-------+-------+ - * +-------+-------+ - * D | | | - * +-------+-------+ - */ -limb_t div_3_limbs(const limb_t *div_top, limb_t d_lo, limb_t d_hi) -{ - llimb_t R = ((llimb_t)div_top[1] << LIMB_BITS) | div_top[0]; - llimb_t D = ((llimb_t)d_hi << LIMB_BITS) | d_lo; - limb_t Q = 0, mask; - size_t i; - - for (i = 0; i < LIMB_BITS; i++) { - Q <<= 1; - mask = (R >= D); - Q |= mask; - R -= (D & ((llimb_t)0 - mask)); - D >>= 1; - } - - mask = 0 - (Q >> (LIMB_BITS - 1)); /* does it overflow? */ - - Q <<= 1; - Q |= (R >= D); - - return (Q | mask); -} -___ - -$code.=<<___; -.text - -.globl div_3_limbs -.hidden div_3_limbs -.type div_3_limbs,\@function,3 -.align 32 -div_3_limbs: - mov (%rdi),%r8 # load R.lo - mov 8(%rdi),%r9 # load R.hi - xor %rax,%rax # Q = 0 - mov \$64,%ecx # loop counter - -.Loop: - mov %r8,%r10 # put aside R - sub %rsi,%r8 # R -= D - mov %r9,%r11 - sbb %rdx,%r9 - lea 1(%rax,%rax),%rax # Q <<= 1 + speculative bit - mov %rdx,%rdi - cmovc %r10,%r8 # restore R if R - D borrowed - cmovc %r11,%r9 - sbb \$0,%rax # subtract speculative bit - shl \$63,%rdi - shr \$1,%rsi - shr \$1,%rdx - or %rdi,%rsi # D >>= 1 - sub \$1,%ecx - jnz .Loop - - lea 1(%rax,%rax),%rcx # Q <<= 1 + speculative bit - sar \$63,%rax # top bit -> mask - - sub %rsi,%r8 # R -= D - sbb %rdx,%r9 - sbb \$0,%rcx # subtract speculative bit - - or %rcx,%rax # all ones if overflow - - ret -.size div_3_limbs,.-div_3_limbs -___ -######################################################################## -# Calculate remainder and adjust the quotient, which can be off-by-one. -# Then save quotient in limb next to top limb of the remainder. There is -# place, because the remainder/next-iteration-dividend gets shorter by -# one limb. 
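The adjustment described above can be modelled in C in the same style as the div_3_limbs reference at the top of this file. The sketch below uses a hypothetical name, quot_rem_128_ref, assumes 64-bit limbs with gcc/clang's unsigned __int128, and spells out the borrow propagation that the assembly does with sbb/adc and a mask.

#include <stdint.h>

typedef uint64_t limb_t;
typedef unsigned __int128 llimb_t;

/* rem(2 limbs) = dividend(3 limbs) - divisor(2 limbs)*quotient; if the
 * estimate from div_3_limbs was one too large, back the quotient off and
 * add the divisor once.  The adjusted quotient goes into the limb freed
 * up next to the remainder and is also returned. */
static limb_t quot_rem_128_ref(limb_t div_rem[3], const limb_t divisor[2],
                               limb_t quotient)
{
    limb_t prod[3], rem[3], borrow = 0, mask;
    llimb_t w;
    int i;

    w = (llimb_t)divisor[0] * quotient;             /* divisor * quotient */
    prod[0] = (limb_t)w;
    w = (llimb_t)divisor[1] * quotient + (limb_t)(w >> 64);
    prod[1] = (limb_t)w;
    prod[2] = (limb_t)(w >> 64);

    for (i = 0; i < 3; i++) {                       /* dividend - product */
        w = (llimb_t)div_rem[i] - prod[i] - borrow;
        rem[i] = (limb_t)w;
        borrow = (limb_t)(w >> 64) & 1;
    }

    mask = (limb_t)0 - borrow;                      /* borrow -> all-ones */
    quotient += mask;                               /* quotient -= borrow */
    w = (llimb_t)rem[0] + (divisor[0] & mask);      /* add divisor back   */
    rem[0] = (limb_t)w;
    rem[1] = rem[1] + (divisor[1] & mask) + (limb_t)(w >> 64);

    div_rem[0] = rem[0];                            /* 2-limb remainder   */
    div_rem[1] = rem[1];
    div_rem[2] = quotient;                          /* quotient beside it */

    return quotient;
}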
-{ -my ($div_rem, $divisor, $quotient) = ("%rdi", "%rsi", "%rcx"); -my @acc = ("%r8", "%r9", "%rdx"); -my @tmp = ("%r10", "%r11", "%rax"); - -$code.=<<___; -.globl quot_rem_128 -.hidden quot_rem_128 -.type quot_rem_128,\@function,3 -.align 32 -quot_rem_128: - mov %rdx, %rax - mov %rdx, $quotient - - mulq 0($divisor) # divisor[0:1] * quotient - mov %rax, @acc[0] - mov $quotient, %rax - mov %rdx, @acc[1] - - mulq 8($divisor) - add %rax, @acc[1] - adc \$0, %rdx # %rdx is @acc[2] - - mov 0($div_rem), @tmp[0] # load 3 limbs of the dividend - mov 8($div_rem), @tmp[1] - mov 16($div_rem), @tmp[2] - - sub @acc[0], @tmp[0] # dividend - divisor * quotient - sbb @acc[1], @tmp[1] - sbb @acc[2], @tmp[2] - sbb @acc[0], @acc[0] # borrow -> mask - - add @acc[0], $quotient # if borrowed, adjust the quotient ... - mov @acc[0], @acc[1] - and 0($divisor), @acc[0] - and 8($divisor), @acc[1] - add @acc[0], @tmp[0] # ... and add divisor - adc @acc[1], @tmp[1] - - mov @tmp[0], 0($div_rem) # save 2 limbs of the remainder ... - mov @tmp[1], 8($div_rem) - mov $quotient, 16($div_rem) # ... and 1 limb of the quotient - - mov $quotient, %rax # return adjusted quotient - - ret -.size quot_rem_128,.-quot_rem_128 - -######################################################################## -# Unlike 128-bit case above, quotient is exact. As result just one limb -# of the dividend is sufficient to calculate the remainder... - -.globl quot_rem_64 -.hidden quot_rem_64 -.type quot_rem_64,\@function,3 -.align 32 -quot_rem_64: - mov %rdx, %rax # return quotient - imulq 0($divisor), %rdx # divisor[0] * quotient - - mov 0($div_rem), @tmp[0] # load 1 limb of the dividend - - sub %rdx, @tmp[0] # dividend - divisor * quotient - - mov @tmp[0], 0($div_rem) # save 1 limb of the remainder ... - mov %rax, 8($div_rem) # ... and 1 limb of the quotient - - ret -.size quot_rem_64,.-quot_rem_64 -___ -} - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/mul_mont_256-armv8.pl b/crypto/blst_src/asm/mul_mont_256-armv8.pl deleted file mode 100755 index ba6c2b87980..00000000000 --- a/crypto/blst_src/asm/mul_mont_256-armv8.pl +++ /dev/null @@ -1,409 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# As for "sparse" in subroutine names, see commentary in the -# asm/mulx_mont_256-x86_64.pl module. - -$flavour = shift; -$output = shift; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -($r_ptr,$a_ptr,$b_ptr,$n_ptr,$n0) = map("x$_", 0..4); - -@mod=map("x$_",(5..8)); -$bi="x9"; -@a=map("x$_",(10..13)); -@tmp=map("x$_",(14..17)); -@acc=map("x$_",(19..24)); -$m0=$n_ptr; - -$code.=<<___; -.text - -.globl mul_mont_sparse_256 -.hidden mul_mont_sparse_256 -.type mul_mont_sparse_256,%function -.align 5 -mul_mont_sparse_256: - stp x29,x30,[sp,#-64]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - - ldp @a[0],@a[1],[$a_ptr] - ldr $bi, [$b_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - - mul @acc[0],@a[0],$bi - ldp @mod[0],@mod[1],[$n_ptr] - mul @acc[1],@a[1],$bi - ldp @mod[2],@mod[3],[$n_ptr,#16] - mul @acc[2],@a[2],$bi - mul @acc[3],@a[3],$bi - - umulh @tmp[0],@a[0],$bi - umulh @tmp[1],@a[1],$bi - mul $m0,$n0,@acc[0] - umulh @tmp[2],@a[2],$bi - umulh @tmp[3],@a[3],$bi - adds @acc[1],@acc[1],@tmp[0] - //mul @tmp[0],@mod[0],$m0 - adcs @acc[2],@acc[2],@tmp[1] - mul @tmp[1],@mod[1],$m0 - adcs @acc[3],@acc[3],@tmp[2] - mul @tmp[2],@mod[2],$m0 - adc @acc[4],xzr, @tmp[3] - mul @tmp[3],@mod[3],$m0 -___ -for ($i=1;$i<4;$i++) { -$code.=<<___; - ldr $bi,[$b_ptr,8*$i] - subs xzr,@acc[0],#1 //adds @acc[0],@acc[0],@tmp[0] - umulh @tmp[0],@mod[0],$m0 - adcs @acc[1],@acc[1],@tmp[1] - umulh @tmp[1],@mod[1],$m0 - adcs @acc[2],@acc[2],@tmp[2] - umulh @tmp[2],@mod[2],$m0 - adcs @acc[3],@acc[3],@tmp[3] - umulh @tmp[3],@mod[3],$m0 - adc @acc[4],@acc[4],xzr - - adds @acc[0],@acc[1],@tmp[0] - mul @tmp[0],@a[0],$bi - adcs @acc[1],@acc[2],@tmp[1] - mul @tmp[1],@a[1],$bi - adcs @acc[2],@acc[3],@tmp[2] - mul @tmp[2],@a[2],$bi - adcs @acc[3],@acc[4],@tmp[3] - mul @tmp[3],@a[3],$bi - adc @acc[4],xzr,xzr - - adds @acc[0],@acc[0],@tmp[0] - umulh @tmp[0],@a[0],$bi - adcs @acc[1],@acc[1],@tmp[1] - umulh @tmp[1],@a[1],$bi - adcs @acc[2],@acc[2],@tmp[2] - mul $m0,$n0,@acc[0] - umulh @tmp[2],@a[2],$bi - adcs @acc[3],@acc[3],@tmp[3] - umulh @tmp[3],@a[3],$bi - adc @acc[4],@acc[4],xzr - - adds @acc[1],@acc[1],@tmp[0] - //mul @tmp[0],@mod[0],$m0 - adcs @acc[2],@acc[2],@tmp[1] - mul @tmp[1],@mod[1],$m0 - adcs @acc[3],@acc[3],@tmp[2] - mul @tmp[2],@mod[2],$m0 - adc @acc[4],@acc[4],@tmp[3] - mul @tmp[3],@mod[3],$m0 -___ -} -$code.=<<___; - subs xzr,@acc[0],#1 //adds @acc[0],@acc[0],@tmp[0] - umulh @tmp[0],@mod[0],$m0 - adcs @acc[1],@acc[1],@tmp[1] - umulh @tmp[1],@mod[1],$m0 - adcs @acc[2],@acc[2],@tmp[2] - umulh @tmp[2],@mod[2],$m0 - adcs @acc[3],@acc[3],@tmp[3] - umulh @tmp[3],@mod[3],$m0 - adc @acc[4],@acc[4],xzr - - adds @acc[0],@acc[1],@tmp[0] - adcs @acc[1],@acc[2],@tmp[1] - adcs @acc[2],@acc[3],@tmp[2] - adcs @acc[3],@acc[4],@tmp[3] - adc @acc[4],xzr,xzr - - subs @tmp[0],@acc[0],@mod[0] - sbcs @tmp[1],@acc[1],@mod[1] - sbcs @tmp[2],@acc[2],@mod[2] - sbcs @tmp[3],@acc[3],@mod[3] - sbcs xzr, @acc[4],xzr - - csel @acc[0],@acc[0],@tmp[0],lo - csel @acc[1],@acc[1],@tmp[1],lo - csel @acc[2],@acc[2],@tmp[2],lo - csel @acc[3],@acc[3],@tmp[3],lo - - stp @acc[0],@acc[1],[$r_ptr] - stp @acc[2],@acc[3],[$r_ptr,#16] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldr x29,[sp],#64 - ret -.size mul_mont_sparse_256,.-mul_mont_sparse_256 -___ -{ -my @acc = (@a,@acc[0..3]); -my @a = @mod; - -$code.=<<___; -.globl sqr_mont_sparse_256 -.hidden sqr_mont_sparse_256 -.type sqr_mont_sparse_256,%function -.align 5 -sqr_mont_sparse_256: - paciasp - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - mov $n0,$n_ptr - - //////////////////////////////////////////////////////////////// - // | | | | | |a1*a0| | - // | | | | |a2*a0| | | - // | |a3*a2|a3*a0| | | | - // | | | |a2*a1| | | | - // | | |a3*a1| | | | | - // *| | | | | | | | 2| - // +|a3*a3|a2*a2|a1*a1|a0*a0| - // |--+--+--+--+--+--+--+--| - // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is @acc[x] - // - // "can't overflow" below mark carrying into high part of - // multiplication result, which can't overflow, because it - // can never be all ones. - - mul @acc[1],@a[1],@a[0] // a[1]*a[0] - umulh @tmp[1],@a[1],@a[0] - mul @acc[2],@a[2],@a[0] // a[2]*a[0] - umulh @tmp[2],@a[2],@a[0] - mul @acc[3],@a[3],@a[0] // a[3]*a[0] - umulh @acc[4],@a[3],@a[0] - - adds @acc[2],@acc[2],@tmp[1] // accumulate high parts of multiplication - mul @tmp[0],@a[2],@a[1] // a[2]*a[1] - umulh @tmp[1],@a[2],@a[1] - adcs @acc[3],@acc[3],@tmp[2] - mul @tmp[2],@a[3],@a[1] // a[3]*a[1] - umulh @tmp[3],@a[3],@a[1] - adc @acc[4],@acc[4],xzr // can't overflow - - mul @acc[5],@a[3],@a[2] // a[3]*a[2] - umulh @acc[6],@a[3],@a[2] - - adds @tmp[1],@tmp[1],@tmp[2] // accumulate high parts of multiplication - mul @acc[0],@a[0],@a[0] // a[0]*a[0] - adc @tmp[2],@tmp[3],xzr // can't overflow - - adds @acc[3],@acc[3],@tmp[0] // accumulate low parts of multiplication - umulh @a[0],@a[0],@a[0] - adcs @acc[4],@acc[4],@tmp[1] - mul @tmp[1],@a[1],@a[1] // a[1]*a[1] - adcs @acc[5],@acc[5],@tmp[2] - umulh @a[1],@a[1],@a[1] - adc @acc[6],@acc[6],xzr // can't overflow - - adds @acc[1],@acc[1],@acc[1] // acc[1-6]*=2 - mul @tmp[2],@a[2],@a[2] // a[2]*a[2] - adcs @acc[2],@acc[2],@acc[2] - umulh @a[2],@a[2],@a[2] - adcs @acc[3],@acc[3],@acc[3] - mul @tmp[3],@a[3],@a[3] // a[3]*a[3] - adcs @acc[4],@acc[4],@acc[4] - umulh @a[3],@a[3],@a[3] - adcs @acc[5],@acc[5],@acc[5] - adcs @acc[6],@acc[6],@acc[6] - adc @acc[7],xzr,xzr - - adds @acc[1],@acc[1],@a[0] // +a[i]*a[i] - adcs @acc[2],@acc[2],@tmp[1] - adcs @acc[3],@acc[3],@a[1] - adcs @acc[4],@acc[4],@tmp[2] - adcs @acc[5],@acc[5],@a[2] - adcs @acc[6],@acc[6],@tmp[3] - adc @acc[7],@acc[7],@a[3] - - bl __mul_by_1_mont_256 - ldr x30,[x29,#8] - - adds @acc[0],@acc[0],@acc[4] // accumulate upper half - adcs @acc[1],@acc[1],@acc[5] - adcs @acc[2],@acc[2],@acc[6] - adcs @acc[3],@acc[3],@acc[7] - adc @acc[4],xzr,xzr - - subs @tmp[0],@acc[0],@mod[0] - sbcs @tmp[1],@acc[1],@mod[1] - sbcs @tmp[2],@acc[2],@mod[2] - sbcs @tmp[3],@acc[3],@mod[3] - sbcs xzr, @acc[4],xzr - - csel @acc[0],@acc[0],@tmp[0],lo - csel @acc[1],@acc[1],@tmp[1],lo - csel @acc[2],@acc[2],@tmp[2],lo - csel @acc[3],@acc[3],@tmp[3],lo - - stp @acc[0],@acc[1],[$r_ptr] - stp @acc[2],@acc[3],[$r_ptr,#16] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 -___ -} -{ -my @a = (@a, $bi); - -$code.=<<___; -.globl from_mont_256 -.hidden from_mont_256 -.type from_mont_256,%function -.align 5 -from_mont_256: - paciasp - stp x29,x30,[sp,#-16]! 
- add x29,sp,#0 - - mov $n0,$n_ptr - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - - bl __mul_by_1_mont_256 - ldr x30,[x29,#8] - - subs @tmp[0],@a[0],@mod[0] - sbcs @tmp[1],@a[1],@mod[1] - sbcs @tmp[2],@a[2],@mod[2] - sbcs @tmp[3],@a[3],@mod[3] - - csel @a[0],@a[0],@tmp[0],lo - csel @a[1],@a[1],@tmp[1],lo - csel @a[2],@a[2],@tmp[2],lo - csel @a[3],@a[3],@tmp[3],lo - - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - - ldr x29,[sp],#16 - autiasp - ret -.size from_mont_256,.-from_mont_256 - -.globl redc_mont_256 -.hidden redc_mont_256 -.type redc_mont_256,%function -.align 5 -redc_mont_256: - paciasp - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - mov $n0,$n_ptr - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - - bl __mul_by_1_mont_256 - ldr x30,[x29,#8] - - ldp @tmp[0],@tmp[1],[$a_ptr,#32] - ldp @tmp[2],@tmp[3],[$a_ptr,#48] - - adds @a[0],@a[0],@tmp[0] - adcs @a[1],@a[1],@tmp[1] - adcs @a[2],@a[2],@tmp[2] - adcs @a[3],@a[3],@tmp[3] - adc @a[4],xzr,xzr - - subs @tmp[0],@a[0],@mod[0] - sbcs @tmp[1],@a[1],@mod[1] - sbcs @tmp[2],@a[2],@mod[2] - sbcs @tmp[3],@a[3],@mod[3] - sbcs xzr, @a[4],xzr - - csel @a[0],@a[0],@tmp[0],lo - csel @a[1],@a[1],@tmp[1],lo - csel @a[2],@a[2],@tmp[2],lo - csel @a[3],@a[3],@tmp[3],lo - - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - - ldr x29,[sp],#16 - autiasp - ret -.size redc_mont_256,.-redc_mont_256 - -.type __mul_by_1_mont_256,%function -.align 5 -__mul_by_1_mont_256: - mul $m0,$n0,@a[0] - ldp @mod[0],@mod[1],[$b_ptr] - ldp @mod[2],@mod[3],[$b_ptr,#16] -___ -for ($i=1;$i<4;$i++) { -$code.=<<___; - //mul @tmp[0],@mod[0],$m0 - mul @tmp[1],@mod[1],$m0 - mul @tmp[2],@mod[2],$m0 - mul @tmp[3],@mod[3],$m0 - subs xzr,@a[0],#1 //adds @a[0],@a[0],@tmp[0] - umulh @tmp[0],@mod[0],$m0 - adcs @a[1],@a[1],@tmp[1] - umulh @tmp[1],@mod[1],$m0 - adcs @a[2],@a[2],@tmp[2] - umulh @tmp[2],@mod[2],$m0 - adcs @a[3],@a[3],@tmp[3] - umulh @tmp[3],@mod[3],$m0 - adc @a[4],xzr,xzr - - adds @a[0],@a[1],@tmp[0] - adcs @a[1],@a[2],@tmp[1] - adcs @a[2],@a[3],@tmp[2] - mul $m0,$n0,@a[0] - adc @a[3],@a[4],@tmp[3] -___ -} -$code.=<<___; - //mul @tmp[0],@mod[0],$m0 - mul @tmp[1],@mod[1],$m0 - mul @tmp[2],@mod[2],$m0 - mul @tmp[3],@mod[3],$m0 - subs xzr,@a[0],#1 //adds @a[0],@a[0],@tmp[0] - umulh @tmp[0],@mod[0],$m0 - adcs @a[1],@a[1],@tmp[1] - umulh @tmp[1],@mod[1],$m0 - adcs @a[2],@a[2],@tmp[2] - umulh @tmp[2],@mod[2],$m0 - adcs @a[3],@a[3],@tmp[3] - umulh @tmp[3],@mod[3],$m0 - adc @a[4],xzr,xzr - - adds @a[0],@a[1],@tmp[0] - adcs @a[1],@a[2],@tmp[1] - adcs @a[2],@a[3],@tmp[2] - adc @a[3],@a[4],@tmp[3] - - ret -.size __mul_by_1_mont_256,.-__mul_by_1_mont_256 -___ -} - -print $code; - -close STDOUT; diff --git a/crypto/blst_src/asm/mul_mont_384-armv8.pl b/crypto/blst_src/asm/mul_mont_384-armv8.pl deleted file mode 100755 index 44e12a00b03..00000000000 --- a/crypto/blst_src/asm/mul_mont_384-armv8.pl +++ /dev/null @@ -1,2015 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
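The __mul_by_1_mont_256 helper above is a word-by-word Montgomery reduction by R = 2^256: each pass folds in one multiple of the modulus so that the low limb vanishes, then drops that limb. A loose C model follows; mul_by_1_mont_n_ref is a hypothetical name, 64-bit limbs with __int128 are assumed, and n0 is taken to be -p^-1 mod 2^64, with the final conditional subtraction left to the caller as in from_mont_256 and redc_mont_256.

#include <stdint.h>

typedef uint64_t limb_t;
typedef unsigned __int128 llimb_t;

#define NLIMBS 4

/* In-place reduction: on return a[] holds a_in * 2^(-64*NLIMBS) mod p,
 * possibly not yet fully reduced (at most one extra p), which the callers
 * handle with a conditional subtraction. */
static void mul_by_1_mont_n_ref(limb_t a[NLIMBS], const limb_t p[NLIMBS],
                                limb_t n0)
{
    int i, j;

    for (i = 0; i < NLIMBS; i++) {
        limb_t m = a[0] * n0;           /* a[0] + m*p[0] == 0 mod 2^64   */
        llimb_t w = (llimb_t)m * p[0] + a[0];
        limb_t carry = (limb_t)(w >> 64);

        for (j = 1; j < NLIMBS; j++) {  /* add m*p, shift down one limb  */
            w = (llimb_t)m * p[j] + a[j] + carry;
            a[j - 1] = (limb_t)w;
            carry = (limb_t)(w >> 64);
        }
        a[NLIMBS - 1] = carry;
    }
}

In the same spirit, redc_mont_256 runs this pass and then accumulates the upper half of its 512-bit input before the conditional subtraction.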
-# SPDX-License-Identifier: Apache-2.0 - -$flavour = shift; -$output = shift; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -($r_ptr,$a_ptr,$b_ptr,$n_ptr,$n0) = map("x$_", 0..4); - -@mod = map("x$_",(5..10)); -@a = map("x$_",(11..16)); -$bi = "x17"; -@acc = map("x$_",(19..25)); -@tmp = map("x$_",(26..28,0,1,3)); - -$code.=<<___; -.text - -.globl add_mod_384x384 -.type add_mod_384x384,%function -.align 5 -add_mod_384x384: - paciasp - stp x29,x30,[sp,#-64]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - - bl __add_mod_384x384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldr x29,[sp],#64 - autiasp - ret -.size add_mod_384x384,.-add_mod_384x384 - -.type __add_mod_384x384,%function -.align 5 -__add_mod_384x384: - ldp @a[0], @a[1], [$a_ptr] - ldp @acc[0],@acc[1],[$b_ptr] - ldp @a[2], @a[3], [$a_ptr,#16] - adds @a[0],@a[0],@acc[0] - ldp @acc[2],@acc[3],[$b_ptr,#16] - adcs @a[1],@a[1],@acc[1] - ldp @a[4], @a[5], [$a_ptr,#32] - adcs @a[2],@a[2],@acc[2] - ldp @acc[4],@acc[5],[$b_ptr,#32] - adcs @a[3],@a[3],@acc[3] - stp @a[0], @a[1], [$r_ptr] - adcs @a[4],@a[4],@acc[4] - ldp @a[0], @a[1], [$a_ptr,#48] - adcs @a[5],@a[5],@acc[5] - - ldp @acc[0],@acc[1],[$b_ptr,#48] - stp @a[2], @a[3], [$r_ptr,#16] - ldp @a[2], @a[3], [$a_ptr,#64] - ldp @acc[2],@acc[3],[$b_ptr,#64] - - adcs @a[0],@a[0],@acc[0] - stp @a[4], @a[5], [$r_ptr,#32] - adcs @a[1],@a[1],@acc[1] - ldp @a[4], @a[5], [$a_ptr,#80] - adcs @a[2],@a[2],@acc[2] - ldp @acc[4],@acc[5],[$b_ptr,#80] - adcs @a[3],@a[3],@acc[3] - adcs @a[4],@a[4],@acc[4] - adcs @a[5],@a[5],@acc[5] - adc $bi,xzr,xzr - - subs @acc[0],@a[0],@mod[0] - sbcs @acc[1],@a[1],@mod[1] - sbcs @acc[2],@a[2],@mod[2] - sbcs @acc[3],@a[3],@mod[3] - sbcs @acc[4],@a[4],@mod[4] - sbcs @acc[5],@a[5],@mod[5] - sbcs xzr,$bi,xzr - - csel @a[0],@a[0],@acc[0],lo - csel @a[1],@a[1],@acc[1],lo - csel @a[2],@a[2],@acc[2],lo - csel @a[3],@a[3],@acc[3],lo - stp @a[0],@a[1],[$r_ptr,#48] - csel @a[4],@a[4],@acc[4],lo - stp @a[2],@a[3],[$r_ptr,#64] - csel @a[5],@a[5],@acc[5],lo - stp @a[4],@a[5],[$r_ptr,#80] - - ret -.size __add_mod_384x384,.-__add_mod_384x384 - -.globl sub_mod_384x384 -.type sub_mod_384x384,%function -.align 5 -sub_mod_384x384: - paciasp - stp x29,x30,[sp,#-64]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - - bl __sub_mod_384x384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldr x29,[sp],#64 - autiasp - ret -.size sub_mod_384x384,.-sub_mod_384x384 - -.type __sub_mod_384x384,%function -.align 5 -__sub_mod_384x384: - ldp @a[0], @a[1], [$a_ptr] - ldp @acc[0],@acc[1],[$b_ptr] - ldp @a[2], @a[3], [$a_ptr,#16] - subs @a[0],@a[0],@acc[0] - ldp @acc[2],@acc[3],[$b_ptr,#16] - sbcs @a[1],@a[1],@acc[1] - ldp @a[4], @a[5], [$a_ptr,#32] - sbcs @a[2],@a[2],@acc[2] - ldp @acc[4],@acc[5],[$b_ptr,#32] - sbcs @a[3],@a[3],@acc[3] - stp @a[0], @a[1], [$r_ptr] - sbcs @a[4],@a[4],@acc[4] - ldp @a[0], @a[1], [$a_ptr,#48] - sbcs @a[5],@a[5],@acc[5] - - ldp @acc[0],@acc[1],[$b_ptr,#48] - stp @a[2], @a[3], [$r_ptr,#16] - ldp @a[2], @a[3], [$a_ptr,#64] - ldp @acc[2],@acc[3],[$b_ptr,#64] - - sbcs @a[0],@a[0],@acc[0] - stp @a[4], @a[5], [$r_ptr,#32] - sbcs @a[1],@a[1],@acc[1] - ldp @a[4], @a[5], [$a_ptr,#80] - sbcs @a[2],@a[2],@acc[2] - ldp @acc[4],@acc[5],[$b_ptr,#80] - sbcs @a[3],@a[3],@acc[3] - sbcs @a[4],@a[4],@acc[4] - sbcs @a[5],@a[5],@acc[5] - sbc $bi,xzr,xzr - - and @acc[0],@mod[0],$bi - and @acc[1],@mod[1],$bi - adds @a[0],@a[0],@acc[0] - and @acc[2],@mod[2],$bi - adcs @a[1],@a[1],@acc[1] - and @acc[3],@mod[3],$bi - adcs @a[2],@a[2],@acc[2] - and @acc[4],@mod[4],$bi - adcs @a[3],@a[3],@acc[3] - and @acc[5],@mod[5],$bi - adcs @a[4],@a[4],@acc[4] - stp @a[0],@a[1],[$r_ptr,#48] - adc @a[5],@a[5],@acc[5] - stp @a[2],@a[3],[$r_ptr,#64] - stp @a[4],@a[5],[$r_ptr,#80] - - ret -.size __sub_mod_384x384,.-__sub_mod_384x384 - -.type __add_mod_384,%function -.align 5 -__add_mod_384: - ldp @a[0], @a[1], [$a_ptr] - ldp @acc[0],@acc[1],[$b_ptr] - ldp @a[2], @a[3], [$a_ptr,#16] - adds @a[0],@a[0],@acc[0] - ldp @acc[2],@acc[3],[$b_ptr,#16] - adcs @a[1],@a[1],@acc[1] - ldp @a[4], @a[5], [$a_ptr,#32] - adcs @a[2],@a[2],@acc[2] - ldp @acc[4],@acc[5],[$b_ptr,#32] - adcs @a[3],@a[3],@acc[3] - adcs @a[4],@a[4],@acc[4] - adcs @a[5],@a[5],@acc[5] - adc $bi,xzr,xzr - - subs @acc[0],@a[0],@mod[0] - sbcs @acc[1],@a[1],@mod[1] - sbcs @acc[2],@a[2],@mod[2] - sbcs @acc[3],@a[3],@mod[3] - sbcs @acc[4],@a[4],@mod[4] - sbcs @acc[5],@a[5],@mod[5] - sbcs xzr,$bi,xzr - - csel @a[0],@a[0],@acc[0],lo - csel @a[1],@a[1],@acc[1],lo - csel @a[2],@a[2],@acc[2],lo - csel @a[3],@a[3],@acc[3],lo - csel @a[4],@a[4],@acc[4],lo - stp @a[0],@a[1],[$r_ptr] - csel @a[5],@a[5],@acc[5],lo - stp @a[2],@a[3],[$r_ptr,#16] - stp @a[4],@a[5],[$r_ptr,#32] - - ret -.size __add_mod_384,.-__add_mod_384 - -.type __sub_mod_384,%function -.align 5 -__sub_mod_384: - ldp @a[0], @a[1], [$a_ptr] - ldp @acc[0],@acc[1],[$b_ptr] - ldp @a[2], @a[3], [$a_ptr,#16] - subs @a[0],@a[0],@acc[0] - ldp @acc[2],@acc[3],[$b_ptr,#16] - sbcs @a[1],@a[1],@acc[1] - ldp @a[4], @a[5], [$a_ptr,#32] - sbcs @a[2],@a[2],@acc[2] - ldp @acc[4],@acc[5],[$b_ptr,#32] - sbcs @a[3],@a[3],@acc[3] - sbcs @a[4],@a[4],@acc[4] - sbcs @a[5],@a[5],@acc[5] - sbc $bi,xzr,xzr - - and @acc[0],@mod[0],$bi - and @acc[1],@mod[1],$bi - adds @a[0],@a[0],@acc[0] - and @acc[2],@mod[2],$bi - adcs @a[1],@a[1],@acc[1] - and @acc[3],@mod[3],$bi - adcs @a[2],@a[2],@acc[2] - and @acc[4],@mod[4],$bi - adcs @a[3],@a[3],@acc[3] - and @acc[5],@mod[5],$bi - adcs @a[4],@a[4],@acc[4] - stp @a[0],@a[1],[$r_ptr] - adc @a[5],@a[5],@acc[5] - stp @a[2],@a[3],[$r_ptr,#16] - stp @a[4],@a[5],[$r_ptr,#32] - - ret 
-.size __sub_mod_384,.-__sub_mod_384 - -.globl mul_mont_384x -.hidden mul_mont_384x -.type mul_mont_384x,%function -.align 5 -mul_mont_384x: - paciasp - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#288 // space for 3 768-bit vectors - - mov @tmp[0],$r_ptr // save r_ptr - mov @tmp[1],$a_ptr // save b_ptr - mov @tmp[2],$b_ptr // save b_ptr - - sub $r_ptr,sp,#0 // mul_384(t0, a->re, b->re) - bl __mul_384 - - add $a_ptr,$a_ptr,#48 // mul_384(t1, a->im, b->im) - add $b_ptr,$b_ptr,#48 - add $r_ptr,sp,#96 - bl __mul_384 - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - - sub $b_ptr,$a_ptr,#48 - add $r_ptr,sp,#240 - bl __add_mod_384 - - add $a_ptr,@tmp[2],#0 - add $b_ptr,@tmp[2],#48 - add $r_ptr,sp,#192 // t2 - bl __add_mod_384 - - add $a_ptr,$r_ptr,#0 - add $b_ptr,$r_ptr,#48 - bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - - mov $a_ptr,$r_ptr - add $b_ptr,sp,#0 - bl __sub_mod_384x384 - - add $b_ptr,sp,#96 - bl __sub_mod_384x384 // t2 = t2-t0-t1 - - add $a_ptr,sp,#0 - add $b_ptr,sp,#96 - add $r_ptr,sp,#0 - bl __sub_mod_384x384 // t0 = t0-t1 - - add $a_ptr,sp,#0 // ret->re = redc(t0) - add $r_ptr,@tmp[0],#0 - bl __mul_by_1_mont_384 - bl __redc_tail_mont_384 - - add $a_ptr,sp,#192 // ret->im = redc(t2) - add $r_ptr,$r_ptr,#48 - bl __mul_by_1_mont_384 - bl __redc_tail_mont_384 - ldr x30,[x29,#8] - - add sp,sp,#288 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size mul_mont_384x,.-mul_mont_384x - -.globl sqr_mont_384x -.hidden sqr_mont_384x -.type sqr_mont_384x,%function -.align 5 -sqr_mont_384x: - paciasp - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp $n_ptr,$r_ptr,[sp,#96] // __mul_mont_384 wants them there - sub sp,sp,#96 // space for 2 384-bit vectors - mov $n0,$n_ptr // adjust for missing b_ptr - - ldp @mod[0],@mod[1],[$b_ptr] - ldp @mod[2],@mod[3],[$b_ptr,#16] - ldp @mod[4],@mod[5],[$b_ptr,#32] - - add $b_ptr,$a_ptr,#48 - add $r_ptr,sp,#0 - bl __add_mod_384 // t0 = a->re + a->im - - add $r_ptr,sp,#48 - bl __sub_mod_384 // t1 = a->re - a->im - - ldp @a[0],@a[1],[$a_ptr] - ldr $bi, [$b_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) - - adds @a[0],@a[0],@a[0] // add with itself - adcs @a[1],@a[1],@a[1] - adcs @a[2],@a[2],@a[2] - adcs @a[3],@a[3],@a[3] - adcs @a[4],@a[4],@a[4] - adcs @a[5],@a[5],@a[5] - adc @acc[6],xzr,xzr - - subs @acc[0],@a[0],@mod[0] - sbcs @acc[1],@a[1],@mod[1] - sbcs @acc[2],@a[2],@mod[2] - sbcs @acc[3],@a[3],@mod[3] - sbcs @acc[4],@a[4],@mod[4] - sbcs @acc[5],@a[5],@mod[5] - sbcs xzr,@acc[6],xzr - - csel @acc[0],@a[0],@acc[0],lo - csel @acc[1],@a[1],@acc[1],lo - csel @acc[2],@a[2],@acc[2],lo - ldp @a[0],@a[1],[sp] - csel @acc[3],@a[3],@acc[3],lo - ldr $bi, [sp,#48] - csel @acc[4],@a[4],@acc[4],lo - ldp @a[2],@a[3],[sp,#16] - csel @acc[5],@a[5],@acc[5],lo - ldp @a[4],@a[5],[sp,#32] - - stp @acc[0],@acc[1],[$b_ptr,#48] - stp @acc[2],@acc[3],[$b_ptr,#64] - stp @acc[4],@acc[5],[$b_ptr,#80] - - add $b_ptr,sp,#48 - bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) - ldr x30,[x29,#8] - - stp @a[0],@a[1],[$b_ptr] - stp @a[2],@a[3],[$b_ptr,#16] - stp @a[4],@a[5],[$b_ptr,#32] - - add sp,sp,#96 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size sqr_mont_384x,.-sqr_mont_384x - -.globl mul_mont_384 -.hidden mul_mont_384 -.type mul_mont_384,%function -.align 5 -mul_mont_384: - paciasp - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp $n0,$r_ptr,[sp,#96] // __mul_mont_384 wants them there - - ldp @a[0],@a[1],[$a_ptr] - ldr $bi, [$b_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - - bl __mul_mont_384 - ldr x30,[x29,#8] - - stp @a[0],@a[1],[$b_ptr] - stp @a[2],@a[3],[$b_ptr,#16] - stp @a[4],@a[5],[$b_ptr,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size mul_mont_384,.-mul_mont_384 - -.type __mul_mont_384,%function -.align 5 -__mul_mont_384: - mul @acc[0],@a[0],$bi - mul @acc[1],@a[1],$bi - mul @acc[2],@a[2],$bi - mul @acc[3],@a[3],$bi - mul @acc[4],@a[4],$bi - mul @acc[5],@a[5],$bi - mul $n0,$n0,@acc[0] - - umulh @tmp[0],@a[0],$bi - umulh @tmp[1],@a[1],$bi - umulh @tmp[2],@a[2],$bi - umulh @tmp[3],@a[3],$bi - umulh @tmp[4],@a[4],$bi - umulh @tmp[5],@a[5],$bi - - adds @acc[1],@acc[1],@tmp[0] - // mul @tmp[0],@mod[0],$n0 - adcs @acc[2],@acc[2],@tmp[1] - mul @tmp[1],@mod[1],$n0 - adcs @acc[3],@acc[3],@tmp[2] - mul @tmp[2],@mod[2],$n0 - adcs @acc[4],@acc[4],@tmp[3] - mul @tmp[3],@mod[3],$n0 - adcs @acc[5],@acc[5],@tmp[4] - mul @tmp[4],@mod[4],$n0 - adc @acc[6],xzr, @tmp[5] - mul @tmp[5],@mod[5],$n0 - mov $bi,xzr -___ -for ($i=1;$i<6;$i++) { -$code.=<<___; - subs xzr,@acc[0],#1 // adds @acc[0],@acc[0],@tmp[0] - umulh @tmp[0],@mod[0],$n0 - adcs @acc[1],@acc[1],@tmp[1] - umulh @tmp[1],@mod[1],$n0 - adcs @acc[2],@acc[2],@tmp[2] - umulh @tmp[2],@mod[2],$n0 - adcs @acc[3],@acc[3],@tmp[3] - umulh @tmp[3],@mod[3],$n0 - adcs @acc[4],@acc[4],@tmp[4] - umulh @tmp[4],@mod[4],$n0 - adcs @acc[5],@acc[5],@tmp[5] - umulh @tmp[5],@mod[5],$n0 - adcs @acc[6],@acc[6],xzr - adc $n0,$bi,xzr - ldr $bi,[$b_ptr,8*$i] - - adds @acc[0],@acc[1],@tmp[0] - mul @tmp[0],@a[0],$bi - adcs @acc[1],@acc[2],@tmp[1] - mul @tmp[1],@a[1],$bi - adcs @acc[2],@acc[3],@tmp[2] - mul @tmp[2],@a[2],$bi - adcs @acc[3],@acc[4],@tmp[3] - mul @tmp[3],@a[3],$bi - adcs @acc[4],@acc[5],@tmp[4] - mul @tmp[4],@a[4],$bi - adcs @acc[5],@acc[6],@tmp[5] - mul @tmp[5],@a[5],$bi - adc @acc[6],$n0,xzr - ldr $n0,[x29,#96] - - adds @acc[0],@acc[0],@tmp[0] - umulh @tmp[0],@a[0],$bi - adcs @acc[1],@acc[1],@tmp[1] - umulh @tmp[1],@a[1],$bi - adcs @acc[2],@acc[2],@tmp[2] - mul $n0,$n0,@acc[0] - umulh @tmp[2],@a[2],$bi - adcs @acc[3],@acc[3],@tmp[3] - umulh @tmp[3],@a[3],$bi - adcs @acc[4],@acc[4],@tmp[4] - umulh @tmp[4],@a[4],$bi - adcs @acc[5],@acc[5],@tmp[5] - umulh @tmp[5],@a[5],$bi - adcs @acc[6],@acc[6],xzr - adc $bi,xzr,xzr - - adds @acc[1],@acc[1],@tmp[0] - // mul @tmp[0],@mod[0],$n0 - adcs @acc[2],@acc[2],@tmp[1] - mul @tmp[1],@mod[1],$n0 - adcs @acc[3],@acc[3],@tmp[2] - mul @tmp[2],@mod[2],$n0 - adcs @acc[4],@acc[4],@tmp[3] - mul @tmp[3],@mod[3],$n0 - adcs @acc[5],@acc[5],@tmp[4] - mul @tmp[4],@mod[4],$n0 - adcs @acc[6],@acc[6],@tmp[5] - mul @tmp[5],@mod[5],$n0 - adc $bi,$bi,xzr -___ -} -$code.=<<___; - subs xzr,@acc[0],#1 // adds @acc[0],@acc[0],@tmp[0] - umulh @tmp[0],@mod[0],$n0 - adcs @acc[1],@acc[1],@tmp[1] - umulh @tmp[1],@mod[1],$n0 - adcs @acc[2],@acc[2],@tmp[2] - umulh @tmp[2],@mod[2],$n0 - adcs @acc[3],@acc[3],@tmp[3] - umulh @tmp[3],@mod[3],$n0 - adcs @acc[4],@acc[4],@tmp[4] - umulh @tmp[4],@mod[4],$n0 - adcs @acc[5],@acc[5],@tmp[5] - umulh @tmp[5],@mod[5],$n0 - adcs @acc[6],@acc[6],xzr - ldp 
$n0,$b_ptr,[x29,#96] // pull r_ptr - adc $bi,$bi,xzr - - adds @acc[0],@acc[1],@tmp[0] - adcs @acc[1],@acc[2],@tmp[1] - adcs @acc[2],@acc[3],@tmp[2] - adcs @acc[3],@acc[4],@tmp[3] - adcs @acc[4],@acc[5],@tmp[4] - adcs @acc[5],@acc[6],@tmp[5] - adc @acc[6],$bi,xzr - - subs @tmp[0],@acc[0],@mod[0] - sbcs @tmp[1],@acc[1],@mod[1] - sbcs @tmp[2],@acc[2],@mod[2] - sbcs @tmp[3],@acc[3],@mod[3] - sbcs @tmp[4],@acc[4],@mod[4] - sbcs @tmp[5],@acc[5],@mod[5] - sbcs xzr, @acc[6],xzr - - csel @a[0],@acc[0],@tmp[0],lo - csel @a[1],@acc[1],@tmp[1],lo - csel @a[2],@acc[2],@tmp[2],lo - csel @a[3],@acc[3],@tmp[3],lo - csel @a[4],@acc[4],@tmp[4],lo - csel @a[5],@acc[5],@tmp[5],lo - ret -.size __mul_mont_384,.-__mul_mont_384 - -.globl sqr_mont_384 -.hidden sqr_mont_384 -.type sqr_mont_384,%function -.align 5 -sqr_mont_384: - paciasp - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#96 // space for 768-bit vector - mov $n0,$n_ptr // adjust for missing b_ptr - - mov $n_ptr,$r_ptr // save r_ptr - mov $r_ptr,sp - - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - bl __sqr_384 - - ldp @mod[0],@mod[1],[$b_ptr] - ldp @mod[2],@mod[3],[$b_ptr,#16] - ldp @mod[4],@mod[5],[$b_ptr,#32] - - mov $a_ptr,sp - mov $r_ptr,$n_ptr // restore r_ptr - bl __mul_by_1_mont_384 - bl __redc_tail_mont_384 - ldr x30,[x29,#8] - - add sp,sp,#96 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size sqr_mont_384,.-sqr_mont_384 - -.globl sqr_n_mul_mont_383 -.hidden sqr_n_mul_mont_383 -.type sqr_n_mul_mont_383,%function -.align 5 -sqr_n_mul_mont_383: - paciasp - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp $n0,$r_ptr,[sp,#96] // __mul_mont_384 wants them there - sub sp,sp,#96 // space for 768-bit vector - mov $bi,x5 // save b_ptr - - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - mov $r_ptr,sp -.Loop_sqr_383: - bl __sqr_384 - sub $b_ptr,$b_ptr,#1 // counter - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - - mov $a_ptr,sp - bl __mul_by_1_mont_384 - - ldp @acc[0],@acc[1],[$a_ptr,#48] - ldp @acc[2],@acc[3],[$a_ptr,#64] - ldp @acc[4],@acc[5],[$a_ptr,#80] - - adds @a[0],@a[0],@acc[0] // just accumulate upper half - adcs @a[1],@a[1],@acc[1] - adcs @a[2],@a[2],@acc[2] - adcs @a[3],@a[3],@acc[3] - adcs @a[4],@a[4],@acc[4] - adc @a[5],@a[5],@acc[5] - - cbnz $b_ptr,.Loop_sqr_383 - - mov $b_ptr,$bi - ldr $bi,[$bi] - bl __mul_mont_384 - ldr x30,[x29,#8] - - stp @a[0],@a[1],[$b_ptr] - stp @a[2],@a[3],[$b_ptr,#16] - stp @a[4],@a[5],[$b_ptr,#32] - - add sp,sp,#96 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 -___ -{ -my @acc=(@acc,@tmp[0..2]); - -$code.=<<___; -.type __sqr_384,%function -.align 5 -__sqr_384: - mul @acc[0],@a[1],@a[0] - mul @acc[1],@a[2],@a[0] - mul @acc[2],@a[3],@a[0] - mul @acc[3],@a[4],@a[0] - mul @acc[4],@a[5],@a[0] - - umulh @mod[1],@a[1],@a[0] - umulh @mod[2],@a[2],@a[0] - umulh @mod[3],@a[3],@a[0] - umulh @mod[4],@a[4],@a[0] - adds @acc[1],@acc[1],@mod[1] - umulh @mod[5],@a[5],@a[0] - adcs @acc[2],@acc[2],@mod[2] - mul @mod[2],@a[2],@a[1] - adcs @acc[3],@acc[3],@mod[3] - mul @mod[3],@a[3],@a[1] - adcs @acc[4],@acc[4],@mod[4] - mul @mod[4],@a[4],@a[1] - adc @acc[5],xzr, @mod[5] - mul @mod[5],@a[5],@a[1] - - adds @acc[2],@acc[2],@mod[2] - umulh @mod[2],@a[2],@a[1] - adcs @acc[3],@acc[3],@mod[3] - umulh @mod[3],@a[3],@a[1] - adcs @acc[4],@acc[4],@mod[4] - umulh @mod[4],@a[4],@a[1] - adcs @acc[5],@acc[5],@mod[5] - umulh @mod[5],@a[5],@a[1] - adc @acc[6],xzr,xzr - - mul @mod[0],@a[0],@a[0] - adds @acc[3],@acc[3],@mod[2] - umulh @a[0], @a[0],@a[0] - adcs @acc[4],@acc[4],@mod[3] - mul @mod[3],@a[3],@a[2] - adcs @acc[5],@acc[5],@mod[4] - mul @mod[4],@a[4],@a[2] - adc @acc[6],@acc[6],@mod[5] - mul @mod[5],@a[5],@a[2] - - adds @acc[4],@acc[4],@mod[3] - umulh @mod[3],@a[3],@a[2] - adcs @acc[5],@acc[5],@mod[4] - umulh @mod[4],@a[4],@a[2] - adcs @acc[6],@acc[6],@mod[5] - umulh @mod[5],@a[5],@a[2] - adc @acc[7],xzr,xzr - - mul @mod[1],@a[1],@a[1] - adds @acc[5],@acc[5],@mod[3] - umulh @a[1], @a[1],@a[1] - adcs @acc[6],@acc[6],@mod[4] - mul @mod[4],@a[4],@a[3] - adc @acc[7],@acc[7],@mod[5] - mul @mod[5],@a[5],@a[3] - - adds @acc[6],@acc[6],@mod[4] - umulh @mod[4],@a[4],@a[3] - adcs @acc[7],@acc[7],@mod[5] - umulh @mod[5],@a[5],@a[3] - adc @acc[8],xzr,xzr - mul @mod[2],@a[2],@a[2] - adds @acc[7],@acc[7],@mod[4] - umulh @a[2], @a[2],@a[2] - adc @acc[8],@acc[8],@mod[5] - mul @mod[3],@a[3],@a[3] - - mul @mod[5],@a[5],@a[4] - umulh @a[3], @a[3],@a[3] - adds @acc[8],@acc[8],@mod[5] - umulh @mod[5],@a[5],@a[4] - mul @mod[4],@a[4],@a[4] - adc @acc[9],@mod[5],xzr - - adds @acc[0],@acc[0],@acc[0] - adcs @acc[1],@acc[1],@acc[1] - adcs @acc[2],@acc[2],@acc[2] - adcs @acc[3],@acc[3],@acc[3] - adcs @acc[4],@acc[4],@acc[4] - adcs @acc[5],@acc[5],@acc[5] - adcs @acc[6],@acc[6],@acc[6] - adcs @acc[7],@acc[7],@acc[7] - umulh @a[4], 
@a[4],@a[4] - adcs @acc[8],@acc[8],@acc[8] - mul @mod[5],@a[5],@a[5] - adcs @acc[9],@acc[9],@acc[9] - umulh @a[5], @a[5],@a[5] - adc $a_ptr,xzr,xzr - - adds @acc[0],@acc[0],@a[0] - adcs @acc[1],@acc[1],@mod[1] - adcs @acc[2],@acc[2],@a[1] - adcs @acc[3],@acc[3],@mod[2] - adcs @acc[4],@acc[4],@a[2] - adcs @acc[5],@acc[5],@mod[3] - adcs @acc[6],@acc[6],@a[3] - stp @mod[0],@acc[0],[$r_ptr] - adcs @acc[7],@acc[7],@mod[4] - stp @acc[1],@acc[2],[$r_ptr,#16] - adcs @acc[8],@acc[8],@a[4] - stp @acc[3],@acc[4],[$r_ptr,#32] - adcs @acc[9],@acc[9],@mod[5] - stp @acc[5],@acc[6],[$r_ptr,#48] - adc @a[5],@a[5],$a_ptr - stp @acc[7],@acc[8],[$r_ptr,#64] - stp @acc[9],@a[5],[$r_ptr,#80] - - ret -.size __sqr_384,.-__sqr_384 -___ -} -$code.=<<___; -.globl sqr_384 -.hidden sqr_384 -.type sqr_384,%function -.align 5 -sqr_384: - paciasp - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - bl __sqr_384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size sqr_384,.-sqr_384 - -.globl redc_mont_384 -.hidden redc_mont_384 -.type redc_mont_384,%function -.align 5 -redc_mont_384: - paciasp - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - mov $n0,$n_ptr // adjust for missing b_ptr - - ldp @mod[0],@mod[1],[$b_ptr] - ldp @mod[2],@mod[3],[$b_ptr,#16] - ldp @mod[4],@mod[5],[$b_ptr,#32] - - bl __mul_by_1_mont_384 - bl __redc_tail_mont_384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size redc_mont_384,.-redc_mont_384 - -.globl from_mont_384 -.hidden from_mont_384 -.type from_mont_384,%function -.align 5 -from_mont_384: - paciasp - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - mov $n0,$n_ptr // adjust for missing b_ptr - - ldp @mod[0],@mod[1],[$b_ptr] - ldp @mod[2],@mod[3],[$b_ptr,#16] - ldp @mod[4],@mod[5],[$b_ptr,#32] - - bl __mul_by_1_mont_384 - ldr x30,[x29,#8] - - subs @acc[0],@a[0],@mod[0] - sbcs @acc[1],@a[1],@mod[1] - sbcs @acc[2],@a[2],@mod[2] - sbcs @acc[3],@a[3],@mod[3] - sbcs @acc[4],@a[4],@mod[4] - sbcs @acc[5],@a[5],@mod[5] - - csel @a[0],@a[0],@acc[0],lo - csel @a[1],@a[1],@acc[1],lo - csel @a[2],@a[2],@acc[2],lo - csel @a[3],@a[3],@acc[3],lo - csel @a[4],@a[4],@acc[4],lo - csel @a[5],@a[5],@acc[5],lo - - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - stp @a[4],@a[5],[$r_ptr,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size from_mont_384,.-from_mont_384 - -.type __mul_by_1_mont_384,%function -.align 5 -__mul_by_1_mont_384: - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - mul @tmp[0],$n0,@a[0] - ldp @a[4],@a[5],[$a_ptr,#32] - - // mul @acc[0],@mod[0],@tmp[0] - mul @acc[1],@mod[1],@tmp[0] - mul @acc[2],@mod[2],@tmp[0] - mul @acc[3],@mod[3],@tmp[0] - mul @acc[4],@mod[4],@tmp[0] - mul @acc[5],@mod[5],@tmp[0] - subs xzr,@a[0],#1 // adds @acc[0],@acc[0],@a[0] - umulh @a[0],@mod[0],@tmp[0] - adcs @acc[1],@acc[1],@a[1] - umulh @a[1],@mod[1],@tmp[0] - adcs @acc[2],@acc[2],@a[2] - umulh @a[2],@mod[2],@tmp[0] - adcs @acc[3],@acc[3],@a[3] - umulh @a[3],@mod[3],@tmp[0] - adcs @acc[4],@acc[4],@a[4] - umulh @a[4],@mod[4],@tmp[0] - adcs @acc[5],@acc[5],@a[5] - umulh @a[5],@mod[5],@tmp[0] - adc @acc[6],xzr,xzr -___ -for ($i=1;$i<6;$i++) { -$code.=<<___; - adds @a[0],@a[0],@acc[1] - adcs @a[1],@a[1],@acc[2] - adcs @a[2],@a[2],@acc[3] - mul @tmp[0],$n0,@a[0] - adcs @a[3],@a[3],@acc[4] - adcs @a[4],@a[4],@acc[5] - adc @a[5],@a[5],@acc[6] - - // mul @acc[0],@mod[0],@tmp[0] - mul @acc[1],@mod[1],@tmp[0] - mul @acc[2],@mod[2],@tmp[0] - mul @acc[3],@mod[3],@tmp[0] - mul @acc[4],@mod[4],@tmp[0] - mul @acc[5],@mod[5],@tmp[0] - subs xzr,@a[0],#1 // adds @acc[0],@acc[0],@a[0] - umulh @a[0],@mod[0],@tmp[0] - adcs @acc[1],@acc[1],@a[1] - umulh @a[1],@mod[1],@tmp[0] - adcs @acc[2],@acc[2],@a[2] - umulh @a[2],@mod[2],@tmp[0] - adcs @acc[3],@acc[3],@a[3] - umulh @a[3],@mod[3],@tmp[0] - adcs @acc[4],@acc[4],@a[4] - umulh @a[4],@mod[4],@tmp[0] - adcs @acc[5],@acc[5],@a[5] - umulh @a[5],@mod[5],@tmp[0] - adc @acc[6],xzr,xzr -___ -} -$code.=<<___; - adds @a[0],@a[0],@acc[1] - adcs @a[1],@a[1],@acc[2] - adcs @a[2],@a[2],@acc[3] - adcs @a[3],@a[3],@acc[4] - adcs @a[4],@a[4],@acc[5] - adc @a[5],@a[5],@acc[6] - - ret -.size __mul_by_1_mont_384,.-__mul_by_1_mont_384 - -.type __redc_tail_mont_384,%function -.align 5 -__redc_tail_mont_384: - ldp @acc[0],@acc[1],[$a_ptr,#48] - ldp @acc[2],@acc[3],[$a_ptr,#64] - ldp @acc[4],@acc[5],[$a_ptr,#80] - - adds @a[0],@a[0],@acc[0] // accumulate upper half - adcs @a[1],@a[1],@acc[1] - adcs @a[2],@a[2],@acc[2] - adcs @a[3],@a[3],@acc[3] - adcs @a[4],@a[4],@acc[4] - adcs @a[5],@a[5],@acc[5] - adc @acc[6],xzr,xzr - - subs @acc[0],@a[0],@mod[0] - sbcs @acc[1],@a[1],@mod[1] - sbcs @acc[2],@a[2],@mod[2] - sbcs @acc[3],@a[3],@mod[3] - sbcs @acc[4],@a[4],@mod[4] - sbcs @acc[5],@a[5],@mod[5] - sbcs xzr,@acc[6],xzr - - csel @a[0],@a[0],@acc[0],lo - csel @a[1],@a[1],@acc[1],lo - csel @a[2],@a[2],@acc[2],lo - csel @a[3],@a[3],@acc[3],lo - csel @a[4],@a[4],@acc[4],lo - csel 
@a[5],@a[5],@acc[5],lo - - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - stp @a[4],@a[5],[$r_ptr,#32] - - ret -.size __redc_tail_mont_384,.-__redc_tail_mont_384 - -.globl mul_384 -.hidden mul_384 -.type mul_384,%function -.align 5 -mul_384: - paciasp - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - bl __mul_384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size mul_384,.-mul_384 - -.type __mul_384,%function -.align 5 -__mul_384: - ldp @a[0],@a[1],[$a_ptr] - ldr $bi, [$b_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - mul @acc[0],@a[0],$bi - mul @acc[1],@a[1],$bi - mul @acc[2],@a[2],$bi - mul @acc[3],@a[3],$bi - mul @acc[4],@a[4],$bi - mul @acc[5],@a[5],$bi - - umulh @mod[0],@a[0],$bi - umulh @mod[1],@a[1],$bi - umulh @mod[2],@a[2],$bi - umulh @mod[3],@a[3],$bi - umulh @mod[4],@a[4],$bi - umulh @mod[5],@a[5],$bi - ldr $bi,[$b_ptr,8*1] - - str @acc[0],[$r_ptr] - adds @acc[0],@acc[1],@mod[0] - mul @mod[0],@a[0],$bi - adcs @acc[1],@acc[2],@mod[1] - mul @mod[1],@a[1],$bi - adcs @acc[2],@acc[3],@mod[2] - mul @mod[2],@a[2],$bi - adcs @acc[3],@acc[4],@mod[3] - mul @mod[3],@a[3],$bi - adcs @acc[4],@acc[5],@mod[4] - mul @mod[4],@a[4],$bi - adc @acc[5],xzr, @mod[5] - mul @mod[5],@a[5],$bi -___ -for ($i=1;$i<5;$i++) { -$code.=<<___; - adds @acc[0],@acc[0],@mod[0] - umulh @mod[0],@a[0],$bi - adcs @acc[1],@acc[1],@mod[1] - umulh @mod[1],@a[1],$bi - adcs @acc[2],@acc[2],@mod[2] - umulh @mod[2],@a[2],$bi - adcs @acc[3],@acc[3],@mod[3] - umulh @mod[3],@a[3],$bi - adcs @acc[4],@acc[4],@mod[4] - umulh @mod[4],@a[4],$bi - adcs @acc[5],@acc[5],@mod[5] - umulh @mod[5],@a[5],$bi - ldr $bi,[$b_ptr,#8*($i+1)] - adc @acc[6],xzr,xzr - - str @acc[0],[$r_ptr,8*$i] - adds @acc[0],@acc[1],@mod[0] - mul @mod[0],@a[0],$bi - adcs @acc[1],@acc[2],@mod[1] - mul @mod[1],@a[1],$bi - adcs @acc[2],@acc[3],@mod[2] - mul @mod[2],@a[2],$bi - adcs @acc[3],@acc[4],@mod[3] - mul @mod[3],@a[3],$bi - adcs @acc[4],@acc[5],@mod[4] - mul @mod[4],@a[4],$bi - adc @acc[5],@acc[6],@mod[5] - mul @mod[5],@a[5],$bi -___ -} -$code.=<<___; - adds @acc[0],@acc[0],@mod[0] - umulh @mod[0],@a[0],$bi - adcs @acc[1],@acc[1],@mod[1] - umulh @mod[1],@a[1],$bi - adcs @acc[2],@acc[2],@mod[2] - umulh @mod[2],@a[2],$bi - adcs @acc[3],@acc[3],@mod[3] - umulh @mod[3],@a[3],$bi - adcs @acc[4],@acc[4],@mod[4] - umulh @mod[4],@a[4],$bi - adcs @acc[5],@acc[5],@mod[5] - umulh @mod[5],@a[5],$bi - adc @acc[6],xzr,xzr - - str @acc[0],[$r_ptr,8*$i] - adds @acc[0],@acc[1],@mod[0] - adcs @acc[1],@acc[2],@mod[1] - adcs @acc[2],@acc[3],@mod[2] - adcs @acc[3],@acc[4],@mod[3] - adcs @acc[4],@acc[5],@mod[4] - adc @acc[5],@acc[6],@mod[5] - - stp @acc[0],@acc[1],[$r_ptr,#48] - stp @acc[2],@acc[3],[$r_ptr,#64] - stp @acc[4],@acc[5],[$r_ptr,#80] - - ret -.size __mul_384,.-__mul_384 - -.globl mul_382x -.hidden mul_382x -.type mul_382x,%function -.align 5 -mul_382x: - paciasp - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#96 // space for two 384-bit vectors - - ldp @a[0],@a[1],[$a_ptr] - mov @tmp[0],$r_ptr // save r_ptr - ldp @acc[0],@acc[1],[$a_ptr,#48] - mov @tmp[1],$a_ptr // save a_ptr - ldp @a[2],@a[3],[$a_ptr,#16] - mov @tmp[2],$b_ptr // save b_ptr - ldp @acc[2],@acc[3],[$a_ptr,#64] - ldp @a[4],@a[5],[$a_ptr,#32] - adds @mod[0],$a[0],@acc[0] // t0 = a->re + a->im - ldp @acc[4],@acc[5],[$a_ptr,#80] - adcs @mod[1],$a[1],@acc[1] - ldp @a[0],@a[1],[$b_ptr] - adcs @mod[2],$a[2],@acc[2] - ldp @acc[0],@acc[1],[$b_ptr,#48] - adcs @mod[3],$a[3],@acc[3] - ldp @a[2],@a[3],[$b_ptr,#16] - adcs @mod[4],$a[4],@acc[4] - ldp @acc[2],@acc[3],[$b_ptr,#64] - adc @mod[5],$a[5],@acc[5] - ldp @a[4],@a[5],[$b_ptr,#32] - - stp @mod[0],@mod[1],[sp] - adds @mod[0],$a[0],@acc[0] // t1 = b->re + b->im - ldp @acc[4],@acc[5],[$b_ptr,#80] - adcs @mod[1],$a[1],@acc[1] - stp @mod[2],@mod[3],[sp,#16] - adcs @mod[2],$a[2],@acc[2] - adcs @mod[3],$a[3],@acc[3] - stp @mod[4],@mod[5],[sp,#32] - adcs @mod[4],$a[4],@acc[4] - stp @mod[0],@mod[1],[sp,#48] - adc @mod[5],$a[5],@acc[5] - stp @mod[2],@mod[3],[sp,#64] - stp @mod[4],@mod[5],[sp,#80] - - bl __mul_384 // mul_384(ret->re, a->re, b->re) - - add $a_ptr,sp,#0 // mul_384(ret->im, t0, t1) - add $b_ptr,sp,#48 - add $r_ptr,@tmp[0],#96 - bl __mul_384 - - add $a_ptr,@tmp[1],#48 // mul_384(tx, a->im, b->im) - add $b_ptr,@tmp[2],#48 - add $r_ptr,sp,#0 - bl __mul_384 - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - - add $a_ptr,@tmp[0],#96 // ret->im -= tx - add $b_ptr,sp,#0 - add $r_ptr,@tmp[0],#96 - bl __sub_mod_384x384 - - add $b_ptr,@tmp[0],#0 // ret->im -= ret->re - bl __sub_mod_384x384 - - add $a_ptr,@tmp[0],#0 // ret->re -= tx - add $b_ptr,sp,#0 - add $r_ptr,@tmp[0],#0 - bl __sub_mod_384x384 - ldr x30,[x29,#8] - - add sp,sp,#96 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size mul_382x,.-mul_382x - -.globl sqr_382x -.hidden sqr_382x -.type sqr_382x,%function -.align 5 -sqr_382x: - paciasp - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - ldp @a[0],@a[1],[$a_ptr] - ldp @acc[0],@acc[1],[$a_ptr,#48] - ldp @a[2],@a[3],[$a_ptr,#16] - adds @mod[0],$a[0],@acc[0] // t0 = a->re + a->im - ldp @acc[2],@acc[3],[$a_ptr,#64] - adcs @mod[1],$a[1],@acc[1] - ldp @a[4],@a[5],[$a_ptr,#32] - adcs @mod[2],$a[2],@acc[2] - ldp @acc[4],@acc[5],[$a_ptr,#80] - adcs @mod[3],$a[3],@acc[3] - stp @mod[0],@mod[1],[$r_ptr] - adcs @mod[4],$a[4],@acc[4] - ldp @mod[0],@mod[1],[$b_ptr] - adc @mod[5],$a[5],@acc[5] - stp @mod[2],@mod[3],[$r_ptr,#16] - - subs @a[0],$a[0],@acc[0] // t1 = a->re - a->im - ldp @mod[2],@mod[3],[$b_ptr,#16] - sbcs @a[1],$a[1],@acc[1] - stp @mod[4],@mod[5],[$r_ptr,#32] - sbcs @a[2],$a[2],@acc[2] - ldp @mod[4],@mod[5],[$b_ptr,#32] - sbcs @a[3],$a[3],@acc[3] - sbcs @a[4],$a[4],@acc[4] - sbcs @a[5],$a[5],@acc[5] - sbc @acc[6],xzr,xzr - - and @acc[0],@mod[0],@acc[6] - and @acc[1],@mod[1],@acc[6] - adds @a[0],@a[0],@acc[0] - and @acc[2],@mod[2],@acc[6] - adcs @a[1],@a[1],@acc[1] - and @acc[3],@mod[3],@acc[6] - adcs @a[2],@a[2],@acc[2] - and @acc[4],@mod[4],@acc[6] - adcs @a[3],@a[3],@acc[3] - and @acc[5],@mod[5],@acc[6] - adcs @a[4],@a[4],@acc[4] - stp @a[0],@a[1],[$r_ptr,#48] - adc @a[5],@a[5],@acc[5] - stp @a[2],@a[3],[$r_ptr,#64] - stp @a[4],@a[5],[$r_ptr,#80] - - mov $n0,$a_ptr // save a_ptr - add $a_ptr,$r_ptr,#0 // mul_384(ret->re, t0, t1) - add $b_ptr,$r_ptr,#48 - bl __mul_384 - - add $a_ptr,$n0,#0 // mul_384(ret->im, a->re, a->im) - add $b_ptr,$n0,#48 - add $r_ptr,$r_ptr,#96 - bl __mul_384 - ldr x30,[x29,#8] - - ldp @a[0],@a[1],[$r_ptr] - ldp @a[2],@a[3],[$r_ptr,#16] - adds @a[0],@a[0],@a[0] // add with itself - ldp @a[4],@a[5],[$r_ptr,#32] - adcs @a[1],@a[1],@a[1] - adcs @a[2],@a[2],@a[2] - adcs @a[3],@a[3],@a[3] - adcs @a[4],@a[4],@a[4] - adcs @a[5],@a[5],@a[5] - adcs @acc[0],@acc[0],@acc[0] - adcs @acc[1],@acc[1],@acc[1] - stp @a[0],@a[1],[$r_ptr] - adcs @acc[2],@acc[2],@acc[2] - stp @a[2],@a[3],[$r_ptr,#16] - adcs @acc[3],@acc[3],@acc[3] - stp @a[4],@a[5],[$r_ptr,#32] - adcs @acc[4],@acc[4],@acc[4] - stp @acc[0],@acc[1],[$r_ptr,#48] - adc @acc[5],@acc[5],@acc[5] - stp @acc[2],@acc[3],[$r_ptr,#64] - stp @acc[4],@acc[5],[$r_ptr,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size sqr_382x,.-sqr_382x - -.globl sqr_mont_382x -.hidden sqr_mont_382x -.type sqr_mont_382x,%function -.align 5 -sqr_mont_382x: - paciasp - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp $n_ptr,$r_ptr,[sp,#96] // __mul_mont_384 wants them there - sub sp,sp,#112 // space for two 384-bit vectors + word - mov $n0,$n_ptr // adjust for missing b_ptr - - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - ldp $bi,@acc[1],[$a_ptr,#48] - ldp @acc[2],@acc[3],[$a_ptr,#64] - ldp @acc[4],@acc[5],[$a_ptr,#80] - - adds @mod[0],$a[0],$bi // t0 = a->re + a->im - adcs @mod[1],$a[1],@acc[1] - adcs @mod[2],$a[2],@acc[2] - adcs @mod[3],$a[3],@acc[3] - adcs @mod[4],$a[4],@acc[4] - adc @mod[5],$a[5],@acc[5] - - subs @acc[0],$a[0],$bi // t1 = a->re - a->im - sbcs @acc[1],$a[1],@acc[1] - sbcs @acc[2],$a[2],@acc[2] - sbcs @acc[3],$a[3],@acc[3] - sbcs @acc[4],$a[4],@acc[4] - sbcs @acc[5],$a[5],@acc[5] - sbc @acc[6],xzr,xzr // borrow flag as mask - - stp @mod[0],@mod[1],[sp] - stp @mod[2],@mod[3],[sp,#16] - stp @mod[4],@mod[5],[sp,#32] - stp @acc[0],@acc[1],[sp,#48] - stp @acc[2],@acc[3],[sp,#64] - stp @acc[4],@acc[5],[sp,#80] - str @acc[6],[sp,#96] - - ldp @mod[0],@mod[1],[$b_ptr] - ldp @mod[2],@mod[3],[$b_ptr,#16] - ldp @mod[4],@mod[5],[$b_ptr,#32] - - add $b_ptr,$a_ptr,#48 - bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) - - adds @acc[0],@a[0],@a[0] // add with itself - adcs @acc[1],@a[1],@a[1] - adcs @acc[2],@a[2],@a[2] - adcs @acc[3],@a[3],@a[3] - adcs @acc[4],@a[4],@a[4] - adc @acc[5],@a[5],@a[5] - - stp @acc[0],@acc[1],[$b_ptr,#48] - stp @acc[2],@acc[3],[$b_ptr,#64] - stp @acc[4],@acc[5],[$b_ptr,#80] - - ldp @a[0],@a[1],[sp] - ldr $bi,[sp,#48] - ldp @a[2],@a[3],[sp,#16] - ldp @a[4],@a[5],[sp,#32] - - add $b_ptr,sp,#48 - bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) - ldr x30,[x29,#8] - - ldr @acc[6],[sp,#96] // account for sign from a->re - a->im - ldp @acc[0],@acc[1],[sp] - ldp @acc[2],@acc[3],[sp,#16] - ldp @acc[4],@acc[5],[sp,#32] - - and @acc[0],@acc[0],@acc[6] - and @acc[1],@acc[1],@acc[6] - and @acc[2],@acc[2],@acc[6] - and @acc[3],@acc[3],@acc[6] - and @acc[4],@acc[4],@acc[6] - and @acc[5],@acc[5],@acc[6] - - subs @a[0],@a[0],@acc[0] - sbcs @a[1],@a[1],@acc[1] - sbcs @a[2],@a[2],@acc[2] - sbcs @a[3],@a[3],@acc[3] - sbcs @a[4],@a[4],@acc[4] - sbcs @a[5],@a[5],@acc[5] - sbc @acc[6],xzr,xzr - - and @acc[0],@mod[0],@acc[6] - and @acc[1],@mod[1],@acc[6] - and @acc[2],@mod[2],@acc[6] - and @acc[3],@mod[3],@acc[6] - and @acc[4],@mod[4],@acc[6] - and @acc[5],@mod[5],@acc[6] - - adds @a[0],@a[0],@acc[0] - adcs @a[1],@a[1],@acc[1] - adcs @a[2],@a[2],@acc[2] - adcs @a[3],@a[3],@acc[3] - adcs @a[4],@a[4],@acc[4] - adc @a[5],@a[5],@acc[5] - - stp @a[0],@a[1],[$b_ptr] - stp @a[2],@a[3],[$b_ptr,#16] - stp @a[4],@a[5],[$b_ptr,#32] - - add sp,sp,#112 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size sqr_mont_382x,.-sqr_mont_382x - -.type __mul_mont_383_nonred,%function -.align 5 -__mul_mont_383_nonred: - mul @acc[0],@a[0],$bi - mul @acc[1],@a[1],$bi - mul @acc[2],@a[2],$bi - mul @acc[3],@a[3],$bi - mul @acc[4],@a[4],$bi - mul @acc[5],@a[5],$bi - mul $n0,$n0,@acc[0] - - umulh @tmp[0],@a[0],$bi - umulh @tmp[1],@a[1],$bi - umulh @tmp[2],@a[2],$bi - umulh @tmp[3],@a[3],$bi - umulh @tmp[4],@a[4],$bi - umulh @tmp[5],@a[5],$bi - - adds @acc[1],@acc[1],@tmp[0] - mul @tmp[0],@mod[0],$n0 - adcs @acc[2],@acc[2],@tmp[1] - mul @tmp[1],@mod[1],$n0 - adcs @acc[3],@acc[3],@tmp[2] - mul @tmp[2],@mod[2],$n0 
- adcs @acc[4],@acc[4],@tmp[3] - mul @tmp[3],@mod[3],$n0 - adcs @acc[5],@acc[5],@tmp[4] - mul @tmp[4],@mod[4],$n0 - adc @acc[6],xzr, @tmp[5] - mul @tmp[5],@mod[5],$n0 -___ -for ($i=1;$i<6;$i++) { -$code.=<<___; - ldr $bi,[$b_ptr,8*$i] - adds @acc[0],@acc[0],@tmp[0] - umulh @tmp[0],@mod[0],$n0 - adcs @acc[1],@acc[1],@tmp[1] - umulh @tmp[1],@mod[1],$n0 - adcs @acc[2],@acc[2],@tmp[2] - umulh @tmp[2],@mod[2],$n0 - adcs @acc[3],@acc[3],@tmp[3] - umulh @tmp[3],@mod[3],$n0 - adcs @acc[4],@acc[4],@tmp[4] - umulh @tmp[4],@mod[4],$n0 - adcs @acc[5],@acc[5],@tmp[5] - umulh @tmp[5],@mod[5],$n0 - adc @acc[6],@acc[6],xzr - - ldr $n0,[x29,#96] - adds @acc[0],@acc[1],@tmp[0] - mul @tmp[0],@a[0],$bi - adcs @acc[1],@acc[2],@tmp[1] - mul @tmp[1],@a[1],$bi - adcs @acc[2],@acc[3],@tmp[2] - mul @tmp[2],@a[2],$bi - adcs @acc[3],@acc[4],@tmp[3] - mul @tmp[3],@a[3],$bi - adcs @acc[4],@acc[5],@tmp[4] - mul @tmp[4],@a[4],$bi - adcs @acc[5],@acc[6],@tmp[5] - mul @tmp[5],@a[5],$bi - adc @acc[6],xzr,xzr - - adds @acc[0],@acc[0],@tmp[0] - umulh @tmp[0],@a[0],$bi - adcs @acc[1],@acc[1],@tmp[1] - umulh @tmp[1],@a[1],$bi - adcs @acc[2],@acc[2],@tmp[2] - mul $n0,$n0,@acc[0] - umulh @tmp[2],@a[2],$bi - adcs @acc[3],@acc[3],@tmp[3] - umulh @tmp[3],@a[3],$bi - adcs @acc[4],@acc[4],@tmp[4] - umulh @tmp[4],@a[4],$bi - adcs @acc[5],@acc[5],@tmp[5] - umulh @tmp[5],@a[5],$bi - adc @acc[6],@acc[6],xzr - - adds @acc[1],@acc[1],@tmp[0] - mul @tmp[0],@mod[0],$n0 - adcs @acc[2],@acc[2],@tmp[1] - mul @tmp[1],@mod[1],$n0 - adcs @acc[3],@acc[3],@tmp[2] - mul @tmp[2],@mod[2],$n0 - adcs @acc[4],@acc[4],@tmp[3] - mul @tmp[3],@mod[3],$n0 - adcs @acc[5],@acc[5],@tmp[4] - mul @tmp[4],@mod[4],$n0 - adc @acc[6],@acc[6],@tmp[5] - mul @tmp[5],@mod[5],$n0 -___ -} -$code.=<<___; - adds @acc[0],@acc[0],@tmp[0] - umulh @tmp[0],@mod[0],$n0 - adcs @acc[1],@acc[1],@tmp[1] - umulh @tmp[1],@mod[1],$n0 - adcs @acc[2],@acc[2],@tmp[2] - umulh @tmp[2],@mod[2],$n0 - adcs @acc[3],@acc[3],@tmp[3] - umulh @tmp[3],@mod[3],$n0 - adcs @acc[4],@acc[4],@tmp[4] - umulh @tmp[4],@mod[4],$n0 - adcs @acc[5],@acc[5],@tmp[5] - umulh @tmp[5],@mod[5],$n0 - adc @acc[6],@acc[6],xzr - ldp $n0,$b_ptr,[x29,#96] // pull r_ptr - - adds @a[0],@acc[1],@tmp[0] - adcs @a[1],@acc[2],@tmp[1] - adcs @a[2],@acc[3],@tmp[2] - adcs @a[3],@acc[4],@tmp[3] - adcs @a[4],@acc[5],@tmp[4] - adcs @a[5],@acc[6],@tmp[5] - - ret -.size __mul_mont_383_nonred,.-__mul_mont_383_nonred - -.globl sgn0_pty_mont_384 -.hidden sgn0_pty_mont_384 -.type sgn0_pty_mont_384,%function -.align 5 -sgn0_pty_mont_384: - paciasp - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - mov $n0,$b_ptr - ldp @mod[0],@mod[1],[$a_ptr] - ldp @mod[2],@mod[3],[$a_ptr,#16] - ldp @mod[4],@mod[5],[$a_ptr,#32] - mov $a_ptr,$r_ptr - - bl __mul_by_1_mont_384 - ldr x30,[x29,#8] - - and $r_ptr,@a[0],#1 - adds @a[0],@a[0],@a[0] - adcs @a[1],@a[1],@a[1] - adcs @a[2],@a[2],@a[2] - adcs @a[3],@a[3],@a[3] - adcs @a[4],@a[4],@a[4] - adcs @a[5],@a[5],@a[5] - adc $bi,xzr,xzr - - subs @a[0],@a[0],@mod[0] - sbcs @a[1],@a[1],@mod[1] - sbcs @a[2],@a[2],@mod[2] - sbcs @a[3],@a[3],@mod[3] - sbcs @a[4],@a[4],@mod[4] - sbcs @a[5],@a[5],@mod[5] - sbc $bi,$bi,xzr - - mvn $bi,$bi - and $bi,$bi,#2 - orr $r_ptr,$r_ptr,$bi - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 - -.globl sgn0_pty_mont_384x -.hidden sgn0_pty_mont_384x -.type sgn0_pty_mont_384x,%function -.align 5 -sgn0_pty_mont_384x: - paciasp - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - mov $n0,$b_ptr - ldp @mod[0],@mod[1],[$a_ptr] - ldp @mod[2],@mod[3],[$a_ptr,#16] - ldp @mod[4],@mod[5],[$a_ptr,#32] - mov $a_ptr,$r_ptr - - bl __mul_by_1_mont_384 - add $a_ptr,$a_ptr,#48 - - and $b_ptr,@a[0],#1 - orr $n_ptr,@a[0],@a[1] - adds @a[0],@a[0],@a[0] - orr $n_ptr,$n_ptr,@a[2] - adcs @a[1],@a[1],@a[1] - orr $n_ptr,$n_ptr,@a[3] - adcs @a[2],@a[2],@a[2] - orr $n_ptr,$n_ptr,@a[4] - adcs @a[3],@a[3],@a[3] - orr $n_ptr,$n_ptr,@a[5] - adcs @a[4],@a[4],@a[4] - adcs @a[5],@a[5],@a[5] - adc $bi,xzr,xzr - - subs @a[0],@a[0],@mod[0] - sbcs @a[1],@a[1],@mod[1] - sbcs @a[2],@a[2],@mod[2] - sbcs @a[3],@a[3],@mod[3] - sbcs @a[4],@a[4],@mod[4] - sbcs @a[5],@a[5],@mod[5] - sbc $bi,$bi,xzr - - mvn $bi,$bi - and $bi,$bi,#2 - orr $b_ptr,$b_ptr,$bi - - bl __mul_by_1_mont_384 - ldr x30,[x29,#8] - - and $r_ptr,@a[0],#1 - orr $a_ptr,@a[0],@a[1] - adds @a[0],@a[0],@a[0] - orr $a_ptr,$a_ptr,@a[2] - adcs @a[1],@a[1],@a[1] - orr $a_ptr,$a_ptr,@a[3] - adcs @a[2],@a[2],@a[2] - orr $a_ptr,$a_ptr,@a[4] - adcs @a[3],@a[3],@a[3] - orr $a_ptr,$a_ptr,@a[5] - adcs @a[4],@a[4],@a[4] - adcs @a[5],@a[5],@a[5] - adc $bi,xzr,xzr - - subs @a[0],@a[0],@mod[0] - sbcs @a[1],@a[1],@mod[1] - sbcs @a[2],@a[2],@mod[2] - sbcs @a[3],@a[3],@mod[3] - sbcs @a[4],@a[4],@mod[4] - sbcs @a[5],@a[5],@mod[5] - sbc $bi,$bi,xzr - - mvn $bi,$bi - and $bi,$bi,#2 - orr $r_ptr,$r_ptr,$bi - - cmp $n_ptr,#0 - csel $n_ptr,$r_ptr,$b_ptr,eq // a->re==0? prty(a->im) : prty(a->re) - - cmp $a_ptr,#0 - csel $a_ptr,$r_ptr,$b_ptr,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) - - and $n_ptr,$n_ptr,#1 - and $a_ptr,$a_ptr,#2 - orr $r_ptr,$a_ptr,$n_ptr // pack sign and parity - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x -___ - -if (0) { -my @b = ($bi, @mod[0..4]); -my @comba = @acc[4..6]; - -$code.=<<___; -.type __mul_384_comba,%function -.align 5 -__mul_384_comba: - ldp @a[0],@a[1],[$a_ptr] - ldp @b[0],@b[1],[$b_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - ldp @b[2],@b[3],[$b_ptr,#16] - ldp @b[4],@b[5],[$b_ptr,#32] - - mul @comba[0],@a[0],@b[0] - umulh @comba[1],@a[0],@b[0] - mul @acc[0],@a[1],@b[0] - umulh @acc[1],@a[1],@b[0] - str @comba[0],[$r_ptr] -___ - push(@comba,shift(@comba)); -$code.=<<___; - mul @acc[2],@a[0],@b[1] - umulh @acc[3],@a[0],@b[1] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],xzr, @acc[1] - adc @comba[2],xzr,xzr - mul @acc[0],@a[2],@b[0] - umulh @acc[1],@a[2],@b[0] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - str @comba[0],[$r_ptr,#8] -___ - push(@comba,shift(@comba)); -$code.=<<___; - mul @acc[2],@a[1],@b[1] - umulh @acc[3],@a[1],@b[1] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],xzr,xzr - mul @acc[0],@a[0],@b[2] - umulh @acc[1],@a[0],@b[2] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - mul @acc[2],@a[3],@b[0] - umulh @acc[3],@a[3],@b[0] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],@comba[2],xzr - str @comba[0],[$r_ptr,#16] -___ - push(@comba,shift(@comba)); -$code.=<<___; - mul @acc[0],@a[2],@b[1] - umulh @acc[1],@a[2],@b[1] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],xzr,xzr - mul @acc[2],@a[1],@b[2] - umulh @acc[3],@a[1],@b[2] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],@comba[2],xzr - mul @acc[0],@a[0],@b[3] - umulh @acc[1],@a[0],@b[3] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - mul @acc[2],@a[4],@b[0] - umulh @acc[3],@a[4],@b[0] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],@comba[2],xzr - str @comba[0],[$r_ptr,#24] -___ - push(@comba,shift(@comba)); -$code.=<<___; - mul @acc[0],@a[3],@b[1] - umulh @acc[1],@a[3],@b[1] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],xzr,xzr - mul @acc[2],@a[2],@b[2] - umulh @acc[3],@a[2],@b[2] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],@comba[2],xzr - mul @acc[0],@a[1],@b[3] - umulh @acc[1],@a[1],@b[3] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - mul @acc[2],@a[0],@b[4] - umulh @acc[3],@a[0],@b[4] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],@comba[2],xzr - mul @acc[0],@a[5],@b[0] - umulh @acc[1],@a[5],@b[0] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - str @comba[0],[$r_ptr,#32] -___ - push(@comba,shift(@comba)); -$code.=<<___; - mul @acc[2],@a[4],@b[1] - umulh @acc[3],@a[4],@b[1] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],xzr,xzr - mul @acc[0],@a[3],@b[2] - umulh @acc[1],@a[3],@b[2] - adds @comba[0],@comba[0],@acc[2] - adcs 
@comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - mul @acc[2],@a[2],@b[3] - umulh @acc[3],@a[2],@b[3] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],@comba[2],xzr - mul @acc[0],@a[1],@b[4] - umulh @acc[1],@a[1],@b[4] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - mul @acc[2],@a[0],@b[5] - umulh @acc[3],@a[0],@b[5] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],@comba[2],xzr - mul @acc[0],@a[5],@b[1] - umulh @acc[1],@a[5],@b[1] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - str @comba[0],[$r_ptr,#40] -___ - push(@comba,shift(@comba)); -$code.=<<___; - mul @acc[2],@a[4],@b[2] - umulh @acc[3],@a[4],@b[2] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],xzr,xzr - mul @acc[0],@a[3],@b[3] - umulh @acc[1],@a[3],@b[3] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - mul @acc[2],@a[2],@b[4] - umulh @acc[3],@a[2],@b[4] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],@comba[2],xzr - mul @acc[0],@a[1],@b[5] - umulh @acc[1],@a[1],@b[5] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - mul @acc[2],@a[5],@b[2] - umulh @acc[3],@a[5],@b[2] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],@comba[2],xzr - str @comba[0],[$r_ptr,#48] -___ - push(@comba,shift(@comba)); -$code.=<<___; - mul @acc[0],@a[4],@b[3] - umulh @acc[1],@a[4],@b[3] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],xzr,xzr - mul @acc[2],@a[3],@b[4] - umulh @acc[3],@a[3],@b[4] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],@comba[2],xzr - mul @acc[0],@a[2],@b[5] - umulh @acc[1],@a[2],@b[5] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - mul @acc[2],@a[5],@b[3] - umulh @acc[3],@a[5],@b[3] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],@comba[2],xzr - str @comba[0],[$r_ptr,#56] -___ - push(@comba,shift(@comba)); -$code.=<<___; - mul @acc[0],@a[4],@b[4] - umulh @acc[1],@a[4],@b[4] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],xzr,xzr - mul @acc[2],@a[3],@b[5] - umulh @acc[3],@a[3],@b[5] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],@comba[2],xzr - mul @acc[0],@a[5],@b[4] - umulh @acc[1],@a[5],@b[4] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - str @comba[0],[$r_ptr,#64] -___ - push(@comba,shift(@comba)); -$code.=<<___; - mul @acc[2],@a[4],@b[5] - umulh @acc[3],@a[4],@b[5] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],xzr,xzr - mul @acc[0],@a[5],@b[5] - umulh @acc[1],@a[5],@b[5] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - str @comba[0],[$r_ptr,#72] -___ - push(@comba,shift(@comba)); -$code.=<<___; - adds @comba[0],@comba[0],@acc[0] - adc @comba[1],@comba[1],@acc[1] - stp @comba[0],@comba[1],[$r_ptr,#80] - - ret -.size __mul_384_comba,.-__mul_384_comba -___ -} -print $code; - -close STDOUT; diff --git a/crypto/blst_src/asm/mulq_mont_256-x86_64.pl b/crypto/blst_src/asm/mulq_mont_256-x86_64.pl deleted file mode 100755 index 
12e58bb001e..00000000000 --- a/crypto/blst_src/asm/mulq_mont_256-x86_64.pl +++ /dev/null @@ -1,513 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# As for "sparse" in subroutine names, see commentary in the -# asm/mulx_mont_256-x86_64.pl module. - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" - or die "can't call $xlate: $!"; - -# common argument layout -($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); -$b_ptr = "%rbx"; - -{ ############################################################## 256 bits -my @acc=map("%r$_",(9..15)); - -{ ############################################################## mulq -my ($hi, $a0) = ("%rbp", $r_ptr); - -$code.=<<___; -.text - -.globl mul_mont_sparse_256 -.hidden mul_mont_sparse_256 -.type mul_mont_sparse_256,\@function,5,"unwind" -.align 32 -mul_mont_sparse_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $r_ptr -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($b_org), %rax - mov 8*0($a_ptr), @acc[4] - mov 8*1($a_ptr), @acc[5] - mov 8*2($a_ptr), @acc[3] - mov 8*3($a_ptr), $hi - mov $b_org, $b_ptr # evacuate from %rdx - - mov %rax, @acc[6] - mulq @acc[4] # a[0]*b[0] - mov %rax, @acc[0] - mov @acc[6], %rax - mov %rdx, @acc[1] - call __mulq_mont_sparse_256 - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size mul_mont_sparse_256,.-mul_mont_sparse_256 - -.globl sqr_mont_sparse_256 -.hidden sqr_mont_sparse_256 -.type sqr_mont_sparse_256,\@function,4,"unwind" -.align 32 -sqr_mont_sparse_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $r_ptr -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), %rax - mov $n_ptr, $n0 - mov 8*1($a_ptr), @acc[5] - mov $b_org, $n_ptr - mov 8*2($a_ptr), @acc[3] - lea ($a_ptr), $b_ptr - mov 8*3($a_ptr), $hi - - mov %rax, @acc[6] - mulq %rax # a[0]*a[0] - mov %rax, @acc[0] - mov @acc[6], %rax - mov %rdx, @acc[1] - call __mulq_mont_sparse_256 - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 -___ -{ -my @acc=@acc; -$code.=<<___; -.type __mulq_mont_sparse_256,\@abi-omnipotent -.align 32 -__mulq_mont_sparse_256: - mulq @acc[5] # a[1]*b[0] - add %rax, @acc[1] - mov @acc[6], %rax - adc \$0, %rdx - mov %rdx, @acc[2] - 
- mulq @acc[3] # a[2]*b[0] - add %rax, @acc[2] - mov @acc[6], %rax - adc \$0, %rdx - mov %rdx, @acc[3] - - mulq $hi # a[3]*b[0] - add %rax, @acc[3] - mov 8($b_ptr), %rax - adc \$0, %rdx - xor @acc[5], @acc[5] - mov %rdx, @acc[4] - -___ -for (my $i=1; $i<4; $i++) { -my $b_next = $i<3 ? 8*($i+1)."($b_ptr)" : @acc[1]; -$code.=<<___; - mov @acc[0], $a0 - imulq $n0, @acc[0] - - ################################# Multiply by b[$i] - mov %rax, @acc[6] - mulq 8*0($a_ptr) - add %rax, @acc[1] - mov @acc[6], %rax - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*1($a_ptr) - add %rax, @acc[2] - mov @acc[6], %rax - adc \$0, %rdx - add $hi, @acc[2] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*2($a_ptr) - add %rax, @acc[3] - mov @acc[6], %rax - adc \$0, %rdx - add $hi, @acc[3] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*3($a_ptr) - add %rax, @acc[4] - mov @acc[0], %rax - adc \$0, %rdx - add $hi, @acc[4] - adc %rdx, @acc[5] # can't overflow - xor @acc[6], @acc[6] - - ################################# reduction - mulq 8*0($n_ptr) - add %rax, $a0 # guaranteed to be zero - mov @acc[0], %rax - adc %rdx, $a0 - - mulq 8*1($n_ptr) - add %rax, @acc[1] - mov @acc[0], %rax - adc \$0, %rdx - add $a0, @acc[1] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*2($n_ptr) - add %rax, @acc[2] - mov @acc[0], %rax - adc \$0, %rdx - add $hi, @acc[2] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*3($n_ptr) - add %rax, @acc[3] - mov $b_next, %rax - adc \$0, %rdx - add $hi, @acc[3] - adc \$0, %rdx - add %rdx, @acc[4] - adc \$0, @acc[5] - adc \$0, @acc[6] -___ - push(@acc,shift(@acc)); -} -$code.=<<___; - imulq $n0, %rax - mov 8(%rsp), $a_ptr # restore $r_ptr - - ################################# last reduction - mov %rax, @acc[6] - mulq 8*0($n_ptr) - add %rax, @acc[0] # guaranteed to be zero - mov @acc[6], %rax - adc %rdx, @acc[0] - - mulq 8*1($n_ptr) - add %rax, @acc[1] - mov @acc[6], %rax - adc \$0, %rdx - add @acc[0], @acc[1] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*2($n_ptr) - add %rax, @acc[2] - mov @acc[6], %rax - adc \$0, %rdx - add $hi, @acc[2] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*3($n_ptr) - mov @acc[2], $b_ptr - add $hi, @acc[3] - adc \$0, %rdx - add %rax, @acc[3] - mov @acc[1], %rax - adc \$0, %rdx - add %rdx, @acc[4] - adc \$0, @acc[5] - - ################################# - # Branch-less conditional subtraction of modulus - - mov @acc[3], @acc[0] - sub 8*0($n_ptr), @acc[1] - sbb 8*1($n_ptr), @acc[2] - sbb 8*2($n_ptr), @acc[3] - mov @acc[4], $hi - sbb 8*3($n_ptr), @acc[4] - sbb \$0, @acc[5] - - cmovc %rax, @acc[1] - cmovc $b_ptr, @acc[2] - cmovc @acc[0], @acc[3] - mov @acc[1], 8*0($a_ptr) - cmovc $hi, @acc[4] - mov @acc[2], 8*1($a_ptr) - mov @acc[3], 8*2($a_ptr) - mov @acc[4], 8*3($a_ptr) - - ret -.cfi_endproc -.size __mulq_mont_sparse_256,.-__mulq_mont_sparse_256 -___ -} } -{ my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" - -$code.=<<___; -.globl from_mont_256 -.hidden from_mont_256 -.type from_mont_256,\@function,4,"unwind" -.align 32 -from_mont_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov $b_org, $n_ptr - call __mulq_by_1_mont_256 - - ################################# - # Branch-less conditional acc[0:3] - modulus - - #mov @acc[4], %rax # __mulq_by_1_mont_256 does it - mov @acc[5], @acc[1] - mov @acc[6], @acc[2] - mov @acc[0], @acc[3] - - sub 8*0($n_ptr), @acc[4] - sbb 8*1($n_ptr), @acc[5] - sbb 
8*2($n_ptr), @acc[6] - sbb 8*3($n_ptr), @acc[0] - - cmovnc @acc[4], %rax - cmovnc @acc[5], @acc[1] - cmovnc @acc[6], @acc[2] - mov %rax, 8*0($r_ptr) - cmovnc @acc[0], @acc[3] - mov @acc[1], 8*1($r_ptr) - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size from_mont_256,.-from_mont_256 - -.globl redc_mont_256 -.hidden redc_mont_256 -.type redc_mont_256,\@function,4,"unwind" -.align 32 -redc_mont_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov $b_org, $n_ptr - call __mulq_by_1_mont_256 - - add 8*4($a_ptr), @acc[4] # accumulate upper half - adc 8*5($a_ptr), @acc[5] - mov @acc[4], %rax - adc 8*6($a_ptr), @acc[6] - mov @acc[5], @acc[1] - adc 8*7($a_ptr), @acc[0] - sbb $a_ptr, $a_ptr - - ################################# - # Branch-less conditional acc[0:4] - modulus - - mov @acc[6], @acc[2] - sub 8*0($n_ptr), @acc[4] - sbb 8*1($n_ptr), @acc[5] - sbb 8*2($n_ptr), @acc[6] - mov @acc[0], @acc[3] - sbb 8*3($n_ptr), @acc[0] - sbb \$0, $a_ptr - - cmovnc @acc[4], %rax - cmovnc @acc[5], @acc[1] - cmovnc @acc[6], @acc[2] - mov %rax, 8*0($r_ptr) - cmovnc @acc[0], @acc[3] - mov @acc[1], 8*1($r_ptr) - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size redc_mont_256,.-redc_mont_256 -___ -{ -my @acc=@acc; - -$code.=<<___; -.type __mulq_by_1_mont_256,\@abi-omnipotent -.align 32 -__mulq_by_1_mont_256: - mov 8*0($a_ptr), %rax - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - - mov %rax, @acc[4] - imulq $n0, %rax - mov %rax, @acc[0] -___ -for (my $i=0; $i<4; $i++) { -my $hi = @acc[4]; -$code.=<<___; - ################################# reduction $i - mulq 8*0($n_ptr) - add %rax, @acc[4] # guaranteed to be zero - mov @acc[0], %rax - adc %rdx, @acc[4] - - mulq 8*1($n_ptr) - add %rax, @acc[1] - mov @acc[0], %rax - adc \$0, %rdx - add @acc[4], @acc[1] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*2($n_ptr) -___ -$code.=<<___ if ($i<3); - mov @acc[1], @acc[5] - imulq $n0, @acc[1] -___ -$code.=<<___; - add %rax, @acc[2] - mov @acc[0], %rax - adc \$0, %rdx - add $hi, @acc[2] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*3($n_ptr) - add %rax, @acc[3] - mov @acc[1], %rax - adc \$0, %rdx - add $hi, @acc[3] - adc \$0, %rdx - mov %rdx, @acc[4] -___ - push(@acc,shift(@acc)); -} -$code.=<<___; - ret -.size __mulq_by_1_mont_256,.-__mulq_by_1_mont_256 -___ -} } } - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/mulq_mont_384-x86_64.pl b/crypto/blst_src/asm/mulq_mont_384-x86_64.pl deleted file mode 100755 index 3812319e8ba..00000000000 --- a/crypto/blst_src/asm/mulq_mont_384-x86_64.pl +++ /dev/null @@ -1,2675 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for 
details. -# SPDX-License-Identifier: Apache-2.0 - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" - or die "can't call $xlate: $!"; - -# common argument layout -($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); -$b_ptr = "%rbx"; - -# common accumulator layout -@acc=map("%r$_",(8..15)); - -######################################################################## -{ my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr); # all registers are affected - # except for $n_ptr and $r_ptr -$code.=<<___; -.text - -######################################################################## -# Double-width subtraction modulo n<<384, as opposite to naively -# expected modulo n*n. It works because n<<384 is the actual -# input boundary condition for Montgomery reduction, not n*n. -# Just in case, this is duplicated, but only one module is -# supposed to be linked... -.type __sub_mod_384x384,\@abi-omnipotent -.align 32 -__sub_mod_384x384: - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - mov 8*6($a_ptr), @acc[6] - - sub 8*0($b_org), @acc[0] - mov 8*7($a_ptr), @acc[7] - sbb 8*1($b_org), @acc[1] - mov 8*8($a_ptr), @acc[8] - sbb 8*2($b_org), @acc[2] - mov 8*9($a_ptr), @acc[9] - sbb 8*3($b_org), @acc[3] - mov 8*10($a_ptr), @acc[10] - sbb 8*4($b_org), @acc[4] - mov 8*11($a_ptr), @acc[11] - sbb 8*5($b_org), @acc[5] - mov @acc[0], 8*0($r_ptr) - sbb 8*6($b_org), @acc[6] - mov 8*0($n_ptr), @acc[0] - mov @acc[1], 8*1($r_ptr) - sbb 8*7($b_org), @acc[7] - mov 8*1($n_ptr), @acc[1] - mov @acc[2], 8*2($r_ptr) - sbb 8*8($b_org), @acc[8] - mov 8*2($n_ptr), @acc[2] - mov @acc[3], 8*3($r_ptr) - sbb 8*9($b_org), @acc[9] - mov 8*3($n_ptr), @acc[3] - mov @acc[4], 8*4($r_ptr) - sbb 8*10($b_org), @acc[10] - mov 8*4($n_ptr), @acc[4] - mov @acc[5], 8*5($r_ptr) - sbb 8*11($b_org), @acc[11] - mov 8*5($n_ptr), @acc[5] - sbb $b_org, $b_org - - and $b_org, @acc[0] - and $b_org, @acc[1] - and $b_org, @acc[2] - and $b_org, @acc[3] - and $b_org, @acc[4] - and $b_org, @acc[5] - - add @acc[0], @acc[6] - adc @acc[1], @acc[7] - mov @acc[6], 8*6($r_ptr) - adc @acc[2], @acc[8] - mov @acc[7], 8*7($r_ptr) - adc @acc[3], @acc[9] - mov @acc[8], 8*8($r_ptr) - adc @acc[4], @acc[10] - mov @acc[9], 8*9($r_ptr) - adc @acc[5], @acc[11] - mov @acc[10], 8*10($r_ptr) - mov @acc[11], 8*11($r_ptr) - - ret -.size __sub_mod_384x384,.-__sub_mod_384x384 - -.type __add_mod_384,\@abi-omnipotent -.align 32 -__add_mod_384: - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - - add 8*0($b_org), @acc[0] - adc 8*1($b_org), @acc[1] - adc 8*2($b_org), @acc[2] - mov @acc[0], @acc[6] - adc 8*3($b_org), @acc[3] - mov @acc[1], @acc[7] - adc 8*4($b_org), @acc[4] - mov @acc[2], @acc[8] - adc 8*5($b_org), @acc[5] - mov @acc[3], @acc[9] - sbb $b_org, $b_org - - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - mov @acc[4], @acc[10] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - mov @acc[5], @acc[11] - sbb 8*5($n_ptr), @acc[5] - 
sbb \$0, $b_org - - cmovc @acc[6], @acc[0] - cmovc @acc[7], @acc[1] - cmovc @acc[8], @acc[2] - mov @acc[0], 8*0($r_ptr) - cmovc @acc[9], @acc[3] - mov @acc[1], 8*1($r_ptr) - cmovc @acc[10], @acc[4] - mov @acc[2], 8*2($r_ptr) - cmovc @acc[11], @acc[5] - mov @acc[3], 8*3($r_ptr) - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - ret -.size __add_mod_384,.-__add_mod_384 - -.type __sub_mod_384,\@abi-omnipotent -.align 32 -__sub_mod_384: - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - -__sub_mod_384_a_is_loaded: - sub 8*0($b_org), @acc[0] - mov 8*0($n_ptr), @acc[6] - sbb 8*1($b_org), @acc[1] - mov 8*1($n_ptr), @acc[7] - sbb 8*2($b_org), @acc[2] - mov 8*2($n_ptr), @acc[8] - sbb 8*3($b_org), @acc[3] - mov 8*3($n_ptr), @acc[9] - sbb 8*4($b_org), @acc[4] - mov 8*4($n_ptr), @acc[10] - sbb 8*5($b_org), @acc[5] - mov 8*5($n_ptr), @acc[11] - sbb $b_org, $b_org - - and $b_org, @acc[6] - and $b_org, @acc[7] - and $b_org, @acc[8] - and $b_org, @acc[9] - and $b_org, @acc[10] - and $b_org, @acc[11] - - add @acc[6], @acc[0] - adc @acc[7], @acc[1] - mov @acc[0], 8*0($r_ptr) - adc @acc[8], @acc[2] - mov @acc[1], 8*1($r_ptr) - adc @acc[9], @acc[3] - mov @acc[2], 8*2($r_ptr) - adc @acc[10], @acc[4] - mov @acc[3], 8*3($r_ptr) - adc @acc[11], @acc[5] - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - ret -.size __sub_mod_384,.-__sub_mod_384 -___ -} - -######################################################################## -# "Complex" multiplication and squaring. Use vanilla multiplication when -# possible to fold reductions. I.e. instead of mul_mont, mul_mont -# followed by add/sub_mod, it calls mul, mul, double-width add/sub_mod -# followed by *common* reduction... 
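The comment above is the key idea behind mul_mont_384x/sqr_mont_384x below: the real and imaginary halves are combined with plain 384x384->768-bit multiplications and double-width additions/subtractions, and only the two final values go through Montgomery reduction, saving a reduction compared with composing mul_mont with add/sub_mod. A C-level sketch of that flow, using helper names that mirror the routines in this module (the wrapper itself and the exact C signatures are illustrative, not part of blst's exported API):

    #include <stdint.h>

    typedef uint64_t vec384[6];   /* one 384-bit operand                */
    typedef uint64_t vec768[12];  /* one double-width (768-bit) product */

    /* Assumed helper declarations mirroring the assembly routines below. */
    void mul_384(vec768 ret, const vec384 a, const vec384 b);  /* plain multiply */
    void add_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p);
    void sub_mod_384x384(vec768 ret, const vec768 a, const vec768 b, const vec384 p);
    void redc_mont_384(vec384 ret, const vec768 a, const vec384 p, uint64_t n0);

    /* (a.re + i*a.im)*(b.re + i*b.im) with i^2 = -1, folding the reductions:
     * only two redc calls instead of three full Montgomery multiplications. */
    static void mul_mont_384x_sketch(vec384 ret_re, vec384 ret_im,
                                     const vec384 a_re, const vec384 a_im,
                                     const vec384 b_re, const vec384 b_im,
                                     const vec384 p, uint64_t n0)
    {
        vec768 t0, t1, t2;
        vec384 s0, s1;

        mul_384(t0, a_re, b_re);          /* t0 = a.re * b.re                   */
        mul_384(t1, a_im, b_im);          /* t1 = a.im * b.im                   */
        add_mod_384(s0, a_re, a_im, p);   /* s0 = a.re + a.im                   */
        add_mod_384(s1, b_re, b_im, p);   /* s1 = b.re + b.im                   */
        mul_384(t2, s0, s1);              /* t2 = s0 * s1                       */

        sub_mod_384x384(t2, t2, t0, p);   /* t2 -= t0                           */
        sub_mod_384x384(t2, t2, t1, p);   /* t2 -= t1: cross terms remain       */
        sub_mod_384x384(t0, t0, t1, p);   /* t0 = a.re*b.re - a.im*b.im         */

        redc_mont_384(ret_re, t0, p, n0); /* one shared reduction per component */
        redc_mont_384(ret_im, t2, p, n0);
    }

This is exactly the ordering the assembly below follows: three __mulq_384 calls, three double-width subtractions, then one __mulq_by_1_mont_384/__redc_tail_mont_384 pass per output half.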
-{ my $frame = 5*8 + # place for argument off-load + - 3*768/8; # place for 3 768-bit temporary vectors -$code.=<<___; -.globl mul_mont_384x -.hidden mul_mont_384x -.type mul_mont_384x,\@function,5,"unwind" -.align 32 -mul_mont_384x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$$frame, %rsp -.cfi_adjust_cfa_offset $frame -.cfi_end_prologue - - mov $b_org, $b_ptr - mov $r_ptr, 8*4(%rsp) # offload arguments - mov $a_ptr, 8*3(%rsp) - mov $b_org, 8*2(%rsp) - mov $n_ptr, 8*1(%rsp) - mov $n0, 8*0(%rsp) - - ################################# mul_384(t0, a->re, b->re); - #lea 0($b_btr), $b_ptr # b->re - #lea 0($a_ptr), $a_ptr # a->re - lea 40(%rsp), $r_ptr # t0 - call __mulq_384 - - ################################# mul_384(t1, a->im, b->im); - lea 48($b_ptr), $b_ptr # b->im - lea 48($a_ptr), $a_ptr # a->im - lea 40+96(%rsp), $r_ptr # t1 - call __mulq_384 - - ################################# mul_384(t2, a->re+a->im, b->re+b->im); - mov 8*1(%rsp), $n_ptr - lea -48($a_ptr), $b_org - lea 40+192+48(%rsp), $r_ptr - call __add_mod_384 - - mov 8*2(%rsp), $a_ptr - lea 48($a_ptr), $b_org - lea -48($r_ptr), $r_ptr - call __add_mod_384 - - lea ($r_ptr),$b_ptr - lea 48($r_ptr),$a_ptr - call __mulq_384 - - ################################# t2=t2-t0-t1 - lea ($r_ptr), $a_ptr # t2 - lea 40(%rsp), $b_org # t0 - mov 8*1(%rsp), $n_ptr - call __sub_mod_384x384 # t2=t2-t0 - - lea ($r_ptr), $a_ptr # t2 - lea -96($r_ptr), $b_org # t1 - call __sub_mod_384x384 # t2=t2-t1 - - ################################# t0=t0-t1 - lea 40(%rsp), $a_ptr - lea 40+96(%rsp), $b_org - lea 40(%rsp), $r_ptr - call __sub_mod_384x384 # t0-t1 - - mov $n_ptr, $b_ptr # n_ptr for redc_mont_384 - - ################################# redc_mont_384(ret->re, t0, mod, n0); - lea 40(%rsp), $a_ptr # t0 - mov 8*0(%rsp), %rcx # n0 for redc_mont_384 - mov 8*4(%rsp), $r_ptr # ret->re - call __mulq_by_1_mont_384 - call __redc_tail_mont_384 - - ################################# redc_mont_384(ret->im, t2, mod, n0); - lea 40+192(%rsp), $a_ptr # t2 - mov 8*0(%rsp), %rcx # n0 for redc_mont_384 - lea 48($r_ptr), $r_ptr # ret->im - call __mulq_by_1_mont_384 - call __redc_tail_mont_384 - - lea $frame(%rsp), %r8 # size optimization - mov 8*0(%r8),%r15 -.cfi_restore %r15 - mov 8*1(%r8),%r14 -.cfi_restore %r14 - mov 8*2(%r8),%r13 -.cfi_restore %r13 - mov 8*3(%r8),%r12 -.cfi_restore %r12 - mov 8*4(%r8),%rbx -.cfi_restore %rbx - mov 8*5(%r8),%rbp -.cfi_restore %rbp - lea 8*6(%r8),%rsp -.cfi_adjust_cfa_offset -$frame-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size mul_mont_384x,.-mul_mont_384x -___ -} -{ my $frame = 4*8 + # place for argument off-load + - 2*384/8 + # place for 2 384-bit temporary vectors - 8; # align -$code.=<<___; -.globl sqr_mont_384x -.hidden sqr_mont_384x -.type sqr_mont_384x,\@function,4,"unwind" -.align 32 -sqr_mont_384x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$$frame, %rsp -.cfi_adjust_cfa_offset $frame -.cfi_end_prologue - - mov $n_ptr, 8*0(%rsp) # n0 - mov $b_org, $n_ptr # n_ptr - mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 - mov $a_ptr, 8*2(%rsp) - - ################################# add_mod_384(t0, a->re, a->im); - lea 48($a_ptr), $b_org # a->im - lea 32(%rsp), $r_ptr # t0 - call __add_mod_384 - - ################################# sub_mod_384(t1, a->re, 
a->im); - mov 8*2(%rsp), $a_ptr # a->re - lea 48($a_ptr), $b_org # a->im - lea 32+48(%rsp), $r_ptr # t1 - call __sub_mod_384 - - ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); - mov 8*2(%rsp), $a_ptr # a->re - lea 48($a_ptr), $b_ptr # a->im - - mov 48($a_ptr), %rax # a->im - mov 8*0($a_ptr), @acc[6] # a->re - mov 8*1($a_ptr), @acc[7] - mov 8*2($a_ptr), @acc[4] - mov 8*3($a_ptr), @acc[5] - - call __mulq_mont_384 -___ -{ -my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 - 12,13,"ax","bx","bp","si"); -$code.=<<___; - add @acc[0], @acc[0] # add with itself - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - mov @acc[0], @acc[6] - adc @acc[3], @acc[3] - mov @acc[1], @acc[7] - adc @acc[4], @acc[4] - mov @acc[2], @acc[8] - adc @acc[5], @acc[5] - mov @acc[3], @acc[9] - sbb $b_org, $b_org - - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - mov @acc[4], @acc[10] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - mov @acc[5], @acc[11] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, $b_org - - cmovc @acc[6], @acc[0] - cmovc @acc[7], @acc[1] - cmovc @acc[8], @acc[2] - mov @acc[0], 8*6($r_ptr) # ret->im - cmovc @acc[9], @acc[3] - mov @acc[1], 8*7($r_ptr) - cmovc @acc[10], @acc[4] - mov @acc[2], 8*8($r_ptr) - cmovc @acc[11], @acc[5] - mov @acc[3], 8*9($r_ptr) - mov @acc[4], 8*10($r_ptr) - mov @acc[5], 8*11($r_ptr) -___ -} -$code.=<<___; - ################################# mul_mont_384(ret->re, t0, t1, mod, n0); - lea 32(%rsp), $a_ptr # t0 - lea 32+48(%rsp), $b_ptr # t1 - - mov 32+48(%rsp), %rax # t1[0] - mov 32+8*0(%rsp), @acc[6] # t0[0..3] - mov 32+8*1(%rsp), @acc[7] - mov 32+8*2(%rsp), @acc[4] - mov 32+8*3(%rsp), @acc[5] - - call __mulq_mont_384 - - lea $frame(%rsp), %r8 # size optimization - mov 8*0(%r8),%r15 -.cfi_restore %r15 - mov 8*1(%r8),%r14 -.cfi_restore %r14 - mov 8*2(%r8),%r13 -.cfi_restore %r13 - mov 8*3(%r8),%r12 -.cfi_restore %r12 - mov 8*4(%r8),%rbx -.cfi_restore %rbx - mov 8*5(%r8),%rbp -.cfi_restore %rbp - lea 8*6(%r8),%rsp -.cfi_adjust_cfa_offset -$frame-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size sqr_mont_384x,.-sqr_mont_384x - -.globl mul_382x -.hidden mul_382x -.type mul_382x,\@function,4,"unwind" -.align 32 -mul_382x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$$frame, %rsp -.cfi_adjust_cfa_offset $frame -.cfi_end_prologue - - lea 96($r_ptr), $r_ptr # ret->im - mov $a_ptr, 8*0(%rsp) - mov $b_org, 8*1(%rsp) - mov $r_ptr, 8*2(%rsp) # offload ret->im - mov $n_ptr, 8*3(%rsp) - - ################################# t0 = a->re + a->im - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - - add 8*6($a_ptr), @acc[0] - adc 8*7($a_ptr), @acc[1] - adc 8*8($a_ptr), @acc[2] - adc 8*9($a_ptr), @acc[3] - adc 8*10($a_ptr), @acc[4] - adc 8*11($a_ptr), @acc[5] - - mov @acc[0], 32+8*0(%rsp) - mov @acc[1], 32+8*1(%rsp) - mov @acc[2], 32+8*2(%rsp) - mov @acc[3], 32+8*3(%rsp) - mov @acc[4], 32+8*4(%rsp) - mov @acc[5], 32+8*5(%rsp) - - ################################# t1 = b->re + b->im - mov 8*0($b_org), @acc[0] - mov 8*1($b_org), @acc[1] - mov 8*2($b_org), @acc[2] - mov 8*3($b_org), @acc[3] - mov 8*4($b_org), @acc[4] - mov 8*5($b_org), @acc[5] - - add 8*6($b_org), @acc[0] - adc 8*7($b_org), @acc[1] - adc 8*8($b_org), @acc[2] - adc 8*9($b_org), @acc[3] - adc 8*10($b_org), 
@acc[4] - adc 8*11($b_org), @acc[5] - - mov @acc[0], 32+8*6(%rsp) - mov @acc[1], 32+8*7(%rsp) - mov @acc[2], 32+8*8(%rsp) - mov @acc[3], 32+8*9(%rsp) - mov @acc[4], 32+8*10(%rsp) - mov @acc[5], 32+8*11(%rsp) - - ################################# mul_384(ret->im, t0, t1); - lea 32+8*0(%rsp), $a_ptr # t0 - lea 32+8*6(%rsp), $b_ptr # t1 - call __mulq_384 - - ################################# mul_384(ret->re, a->re, b->re); - mov 8*0(%rsp), $a_ptr - mov 8*1(%rsp), $b_ptr - lea -96($r_ptr), $r_ptr # ret->re - call __mulq_384 - - ################################# mul_384(tx, a->im, b->im); - lea 48($a_ptr), $a_ptr - lea 48($b_ptr), $b_ptr - lea 32(%rsp), $r_ptr - call __mulq_384 - - ################################# ret->im -= tx - mov 8*2(%rsp), $a_ptr # restore ret->im - lea 32(%rsp), $b_org - mov 8*3(%rsp), $n_ptr - mov $a_ptr, $r_ptr - call __sub_mod_384x384 - - ################################# ret->im -= ret->re - lea 0($r_ptr), $a_ptr - lea -96($r_ptr), $b_org - call __sub_mod_384x384 - - ################################# ret->re -= tx - lea -96($r_ptr), $a_ptr - lea 32(%rsp), $b_org - lea -96($r_ptr), $r_ptr - call __sub_mod_384x384 - - lea $frame(%rsp), %r8 # size optimization - mov 8*0(%r8),%r15 -.cfi_restore %r15 - mov 8*1(%r8),%r14 -.cfi_restore %r14 - mov 8*2(%r8),%r13 -.cfi_restore %r13 - mov 8*3(%r8),%r12 -.cfi_restore %r12 - mov 8*4(%r8),%rbx -.cfi_restore %rbx - mov 8*5(%r8),%rbp -.cfi_restore %rbp - lea 8*6(%r8),%rsp -.cfi_adjust_cfa_offset -$frame-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size mul_382x,.-mul_382x -___ -} -{ my @acc=(@acc,"%rax","%rbx","%rbp",$b_org); # all registers are affected - # except for $n_ptr and $r_ptr -$code.=<<___; -.globl sqr_382x -.hidden sqr_382x -.type sqr_382x,\@function,3,"unwind" -.align 32 -sqr_382x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $a_ptr -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov $b_org, $n_ptr - - ################################# t0 = a->re + a->im - mov 8*0($a_ptr), @acc[6] - mov 8*1($a_ptr), @acc[7] - mov 8*2($a_ptr), @acc[8] - mov 8*3($a_ptr), @acc[9] - mov 8*4($a_ptr), @acc[10] - mov 8*5($a_ptr), @acc[11] - - mov @acc[6], @acc[0] - add 8*6($a_ptr), @acc[6] - mov @acc[7], @acc[1] - adc 8*7($a_ptr), @acc[7] - mov @acc[8], @acc[2] - adc 8*8($a_ptr), @acc[8] - mov @acc[9], @acc[3] - adc 8*9($a_ptr), @acc[9] - mov @acc[10], @acc[4] - adc 8*10($a_ptr), @acc[10] - mov @acc[11], @acc[5] - adc 8*11($a_ptr), @acc[11] - - mov @acc[6], 8*0($r_ptr) - mov @acc[7], 8*1($r_ptr) - mov @acc[8], 8*2($r_ptr) - mov @acc[9], 8*3($r_ptr) - mov @acc[10], 8*4($r_ptr) - mov @acc[11], 8*5($r_ptr) - - ################################# t1 = a->re - a->im - lea 48($a_ptr), $b_org - lea 48($r_ptr), $r_ptr - call __sub_mod_384_a_is_loaded - - ################################# mul_384(ret->re, t0, t1); - lea ($r_ptr), $a_ptr - lea -48($r_ptr), $b_ptr - lea -48($r_ptr), $r_ptr - call __mulq_384 - - ################################# mul_384(ret->im, a->re, a->im); - mov (%rsp), $a_ptr - lea 48($a_ptr), $b_ptr - lea 96($r_ptr), $r_ptr - call __mulq_384 - - mov 8*0($r_ptr), @acc[0] # double ret->im - mov 8*1($r_ptr), @acc[1] - mov 8*2($r_ptr), @acc[2] - mov 8*3($r_ptr), @acc[3] - mov 8*4($r_ptr), @acc[4] - mov 8*5($r_ptr), @acc[5] - mov 8*6($r_ptr), @acc[6] - mov 8*7($r_ptr), @acc[7] - mov 8*8($r_ptr), @acc[8] - mov 8*9($r_ptr), @acc[9] - mov 8*10($r_ptr), @acc[10] - add @acc[0], @acc[0] - mov 
8*11($r_ptr), @acc[11] - adc @acc[1], @acc[1] - mov @acc[0], 8*0($r_ptr) - adc @acc[2], @acc[2] - mov @acc[1], 8*1($r_ptr) - adc @acc[3], @acc[3] - mov @acc[2], 8*2($r_ptr) - adc @acc[4], @acc[4] - mov @acc[3], 8*3($r_ptr) - adc @acc[5], @acc[5] - mov @acc[4], 8*4($r_ptr) - adc @acc[6], @acc[6] - mov @acc[5], 8*5($r_ptr) - adc @acc[7], @acc[7] - mov @acc[6], 8*6($r_ptr) - adc @acc[8], @acc[8] - mov @acc[7], 8*7($r_ptr) - adc @acc[9], @acc[9] - mov @acc[8], 8*8($r_ptr) - adc @acc[10], @acc[10] - mov @acc[9], 8*9($r_ptr) - adc @acc[11], @acc[11] - mov @acc[10], 8*10($r_ptr) - mov @acc[11], 8*11($r_ptr) - - mov 8*1(%rsp),%r15 -.cfi_restore %r15 - mov 8*2(%rsp),%r14 -.cfi_restore %r14 - mov 8*3(%rsp),%r13 -.cfi_restore %r13 - mov 8*4(%rsp),%r12 -.cfi_restore %r12 - mov 8*5(%rsp),%rbx -.cfi_restore %rbx - mov 8*6(%rsp),%rbp -.cfi_restore %rbp - lea 8*7(%rsp),%rsp -.cfi_adjust_cfa_offset -8*7 -.cfi_epilogue - ret -.cfi_endproc -.size sqr_382x,.-sqr_382x -___ -} -{ ########################################################## 384-bit mul -my @acc=map("%r$_",("cx",8..12)); -my $bi = "%rbp"; - -$code.=<<___; -.globl mul_384 -.hidden mul_384 -.type mul_384,\@function,3,"unwind" -.align 32 -mul_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 -.cfi_end_prologue - - mov $b_org, $b_ptr - call __mulq_384 - - mov 0(%rsp),%r12 -.cfi_restore %r12 - mov 8(%rsp),%rbx -.cfi_restore %rbx - mov 16(%rsp),%rbp -.cfi_restore %rbp - lea 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 -.cfi_epilogue - ret -.cfi_endproc -.size mul_384,.-mul_384 - -.type __mulq_384,\@abi-omnipotent -.align 32 -__mulq_384: - mov 8*0($b_ptr), %rax - - mov %rax, $bi - mulq 8*0($a_ptr) - mov %rax, 8*0($r_ptr) - mov $bi, %rax - mov %rdx, @acc[0] - - mulq 8*1($a_ptr) - add %rax, @acc[0] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[1] - - mulq 8*2($a_ptr) - add %rax, @acc[1] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[2] - - mulq 8*3($a_ptr) - add %rax, @acc[2] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[3] - - mulq 8*4($a_ptr) - add %rax, @acc[3] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[4] - - mulq 8*5($a_ptr) - add %rax, @acc[4] - mov 8*1($b_ptr), %rax - adc \$0, %rdx - mov %rdx, @acc[5] -___ -for(my $i=1; $i<6; $i++) { -my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : "%rax"; -$code.=<<___; - mov %rax, $bi - mulq 8*0($a_ptr) - add %rax, @acc[0] - mov $bi, %rax - adc \$0, %rdx - mov @acc[0], 8*$i($r_ptr) - mov %rdx, @acc[0] - - mulq 8*1($a_ptr) - add %rax, @acc[1] - mov $bi, %rax - adc \$0, %rdx - add @acc[1], @acc[0] - adc \$0, %rdx - mov %rdx, @acc[1] - - mulq 8*2($a_ptr) - add %rax, @acc[2] - mov $bi, %rax - adc \$0, %rdx - add @acc[2], @acc[1] - adc \$0, %rdx - mov %rdx, @acc[2] - - mulq 8*3($a_ptr) - add %rax, @acc[3] - mov $bi, %rax - adc \$0, %rdx - add @acc[3], @acc[2] - adc \$0, %rdx - mov %rdx, @acc[3] - - mulq 8*4($a_ptr) - add %rax, @acc[4] - mov $bi, %rax - adc \$0, %rdx - add @acc[4], @acc[3] - adc \$0, %rdx - mov %rdx, @acc[4] - - mulq 8*5($a_ptr) - add %rax, @acc[5] - mov $b_next, %rax - adc \$0, %rdx - add @acc[5], @acc[4] - adc \$0, %rdx - mov %rdx, @acc[5] -___ -} -$code.=<<___; - mov @acc[0], 8*6($r_ptr) - mov @acc[1], 8*7($r_ptr) - mov @acc[2], 8*8($r_ptr) - mov @acc[3], 8*9($r_ptr) - mov @acc[4], 8*10($r_ptr) - mov @acc[5], 8*11($r_ptr) - - ret -.size __mulq_384,.-__mulq_384 -___ -} -if (0) { ############################################################## -my @b=map("%r$_",(10..15)); -my @a=reverse(@b); - @b[5]=$b_ptr; -my $bi = "%rbp"; -my @comba=map("%r$_",("cx",8,9)); -# a[0]*b[0] -# a[1]*b[0] -# a[0]*b[1] -# a[2]*b[0] -# a[1]*b[1] -# a[0]*b[2] -# a[3]*b[0] -# a[2]*b[1] -# a[1]*b[2] -# a[0]*b[3] -# a[4]*b[0] -# a[3]*b[1] -# a[2]*b[2] -# a[1]*b[3] -# a[0]*b[4] -# a[5]*b[0] -# a[4]*b[1] -# a[3]*b[2] -# a[2]*b[3] -# a[1]*b[4] -# a[0]*b[5] -# a[5]*b[1] -# a[4]*b[2] -# a[3]*b[3] -# a[2]*b[4] -# a[1]*b[5] -# a[5]*b[2] -# a[4]*b[3] -# a[3]*b[4] -# a[2]*b[5] -# a[5]*b[3] -# a[4]*b[4] -# a[3]*b[5] -# a[5]*b[4] -# a[4]*b[5] -# a[5]*b[5] -# -# 13% less instructions give +15% on Core2, +10% on Goldmont, -# -0% on Sandy Bridge, but -16% on Haswell:-( -# [for reference +5% on Skylake, +11% on Ryzen] - -$code.=<<___; -.type __mulq_comba_384,\@abi-omnipotent -.align 32 -__mulq_comba_384: - mov 8*0($b_ptr), %rax - mov 8*0($a_ptr), @a[0] - mov 8*1($a_ptr), @a[1] - mov 8*1($b_ptr), @b[1] - - mov %rax, @b[0] - mulq @a[0] # a[0]*b[0] - mov %rax, 8*0($r_ptr) - mov @b[0], %rax - mov %rdx, @comba[0] - - ################################# - mov 8*2($a_ptr), @a[2] - xor @comba[2], @comba[2] - mulq @a[1] # a[1]*b[0] - add %rax, @comba[0] - mov @b[1], %rax - adc \$0, %rdx - mov 8*2($b_ptr), @b[2] - mov %rdx, @comba[1] - - mulq @a[0] # a[0]*b[1] - add %rax, @comba[0] - mov @b[0], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - mov @comba[0], 8*1($r_ptr) -___ - push(@comba,shift(@comba)); -$code.=<<___; - xor @comba[2], @comba[2] - mulq @a[2] # a[2]*b[0] - add %rax, @comba[0] - mov @b[1], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq @a[1] # a[1]*b[1] - add %rax, @comba[0] - mov @b[2], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq @a[0] # a[0]*b[2] - add %rax, @comba[0] - mov @b[0], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - mov @comba[0], 8*2($r_ptr) -___ - push(@comba,shift(@comba)); -$code.=<<___; - xor @comba[2], @comba[2] - mulq 8*3($a_ptr) # a[3]*b[0] - add %rax, @comba[0] - mov @b[1], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq @a[2] # a[2]*b[1] - add %rax, @comba[0] - mov @b[2], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq @a[1] # a[1]*b[2] - add %rax, @comba[0] - mov 8*3($b_ptr), %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mov %rax, @b[3] - mulq @a[0] # a[0]*b[3] - add %rax, @comba[0] - mov @b[0], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - mov @comba[0], 
8*3($r_ptr) -___ - push(@comba,shift(@comba)); -$code.=<<___; - xor @comba[2], @comba[2] - mulq 8*4($a_ptr) # a[4]*b[0] - add %rax, @comba[0] - mov @b[1], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq 8*3($a_ptr) # a[3]*b[1] - add %rax, @comba[0] - mov @b[2], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq 8*2($a_ptr) # a[2]*b[2] - add %rax, @comba[0] - mov @b[3], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq @a[1] # a[1]*b[3] - add %rax, @comba[0] - mov 8*4($b_ptr), %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mov %rax, @b[4] - mulq @a[0] # a[0]*b[4] - add %rax, @comba[0] - mov @b[0], %rax - adc %rdx, @comba[1] - mov 8*5($a_ptr), @a[5] - adc \$0, @comba[2] - mov @comba[0], 8*4($r_ptr) -___ - push(@comba,shift(@comba)); -$code.=<<___; - xor @comba[2], @comba[2] - mulq @a[5] # a[5]*b[0] - add %rax, @comba[0] - mov @b[1], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq 8*4($a_ptr) # a[4]*b[1] - add %rax, @comba[0] - mov @b[2], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq 8*3($a_ptr) # a[3]*b[2] - add %rax, @comba[0] - mov @b[3], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq 8*2($a_ptr) # a[2]*b[3] - add %rax, @comba[0] - mov @b[4], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq 8*1($a_ptr) # a[1]*b[4] - add %rax, @comba[0] - mov 8*5($b_ptr), %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mov %rax, @b[5] - mulq @a[0] # a[0]*b[5] - add %rax, @comba[0] - mov @b[1], %rax - adc %rdx, @comba[1] - mov 8*4($a_ptr), @a[4] - adc \$0, @comba[2] - mov @comba[0], 8*5($r_ptr) -___ - push(@comba,shift(@comba)); -$code.=<<___; - xor @comba[2], @comba[2] - mulq @a[5] # a[5]*b[1] - add %rax, @comba[0] - mov @b[2], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq @a[4] # a[4]*b[2] - add %rax, @comba[0] - mov @b[3], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq 8*3($a_ptr) # a[3]*b[3] - add %rax, @comba[0] - mov @b[4], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq 8*2($a_ptr) # a[2]*b[4] - add %rax, @comba[0] - mov @b[5], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq 8*1($a_ptr) # a[1]*b[5] - add %rax, @comba[0] - mov $b[2], %rax - adc %rdx, @comba[1] - mov 8*3($a_ptr), @a[3] - adc \$0, @comba[2] - mov @comba[0], 8*6($r_ptr) -___ - push(@comba,shift(@comba)); -$code.=<<___; - xor @comba[2], @comba[2] - mulq @a[5] # a[5]*b[2] - add %rax, @comba[0] - mov @b[3], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq @a[4] # a[4]*b[3] - add %rax, @comba[0] - mov @b[4], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq @a[3] # a[3]*b[4] - add %rax, @comba[0] - mov @b[5], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq 8*2($a_ptr) # a[2]*b[5] - add %rax, @comba[0] - mov @b[3], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - mov @comba[0], 8*7($r_ptr) -___ - push(@comba,shift(@comba)); -$code.=<<___; - xor @comba[2], @comba[2] - mulq @a[5] # a[5]*b[3] - add %rax, @comba[0] - mov @b[4], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq @a[4] # a[4]*b[4] - add %rax, @comba[0] - mov @b[5], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq @a[3] # a[3]*b[5] - add %rax, @comba[0] - mov @b[4], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - mov @comba[0], 8*8($r_ptr) -___ - push(@comba,shift(@comba)); -$code.=<<___; - xor @comba[2], @comba[2] - mulq @a[5] # a[5]*b[4] - add %rax, @comba[0] - mov @b[5], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq @a[4] # a[4]*b[5] - add %rax, @comba[0] - mov @b[5], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - mov 
@comba[0], 8*9($r_ptr) -___ - push(@comba,shift(@comba)); -$code.=<<___; - mulq @a[5] # a[5]*b[4] - add %rax, @comba[0] - adc %rdx, @comba[1] - - mov @comba[0], 8*10($r_ptr) - mov @comba[1], 8*11($r_ptr) - - ret -.size __mulq_comba_384,.-__mulq_comba_384 -___ -} -{ ########################################################## 384-bit sqr -my @acc=(@acc,"%rcx","%rbx","%rbp",$a_ptr); -my $hi; - -$code.=<<___; -.globl sqr_384 -.hidden sqr_384 -.type sqr_384,\@function,2,"unwind" -.align 32 -sqr_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - call __sqrq_384 - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size sqr_384,.-sqr_384 - -.type __sqrq_384,\@abi-omnipotent -.align 32 -__sqrq_384: - mov 8*0($a_ptr), %rax - mov 8*1($a_ptr), @acc[7] - mov 8*2($a_ptr), @acc[8] - mov 8*3($a_ptr), @acc[9] - - ######################################### - mov %rax, @acc[6] - mulq @acc[7] # a[1]*a[0] - mov %rax, @acc[1] - mov @acc[6], %rax - mov 8*4($a_ptr), @acc[10] - mov %rdx, @acc[2] - - mulq @acc[8] # a[2]*a[0] - add %rax, @acc[2] - mov @acc[6], %rax - adc \$0, %rdx - mov 8*5($a_ptr), @acc[11] - mov %rdx, @acc[3] - - mulq @acc[9] # a[3]*a[0] - add %rax, @acc[3] - mov @acc[6], %rax - adc \$0, %rdx - mov %rdx, @acc[4] - - mulq @acc[10] # a[4]*a[0] - add %rax, @acc[4] - mov @acc[6], %rax - adc \$0, %rdx - mov %rdx, @acc[5] - - mulq @acc[11] # a[5]*a[0] - add %rax, @acc[5] - mov @acc[6], %rax - adc \$0, %rdx - mov %rdx, @acc[6] - - mulq %rax # a[0]*a[0] - xor @acc[0], @acc[0] - mov %rax, 8*0($r_ptr) - mov @acc[7], %rax - add @acc[1], @acc[1] # double acc[1] - adc \$0, @acc[0] - add %rdx, @acc[1] # accumulate a[0]*a[0] - adc \$0, @acc[0] # carries to a[1]*a[1] - mov @acc[1], 8*1($r_ptr) -___ -$hi=@acc[1]; -$code.=<<___; - ######################################### - mulq @acc[8] # a[2]*a[1] - add %rax, @acc[3] - mov @acc[7], %rax - adc \$0, %rdx - mov %rdx, $hi - - mulq @acc[9] # a[3]*a[1] - add %rax, @acc[4] - mov @acc[7], %rax - adc \$0, %rdx - add $hi, @acc[4] - adc \$0, %rdx - mov %rdx, $hi - - mulq @acc[10] # a[4]*a[1] - add %rax, @acc[5] - mov @acc[7], %rax - adc \$0, %rdx - add $hi, @acc[5] - adc \$0, %rdx - mov %rdx, $hi - - mulq @acc[11] # a[5]*a[1] - add %rax, @acc[6] - mov @acc[7], %rax - adc \$0, %rdx - add $hi, @acc[6] - adc \$0, %rdx - mov %rdx, @acc[7] - - mulq %rax # a[1]*a[1] - xor @acc[1], @acc[1] - add %rax, @acc[0] # can't carry - mov @acc[8], %rax - add @acc[2], @acc[2] # double acc[2:3] - adc @acc[3], @acc[3] - adc \$0, @acc[1] - add @acc[0], @acc[2] # accumulate a[1]*a[1] - adc %rdx, @acc[3] - adc \$0, @acc[1] # carries to a[2]*a[2] - mov @acc[2], 8*2($r_ptr) -___ -$hi=@acc[0]; -$code.=<<___; - ######################################### - mulq @acc[9] # a[3]*a[2] - add %rax, @acc[5] - mov @acc[8], %rax - adc \$0, %rdx - mov @acc[3], 8*3($r_ptr) - mov %rdx, $hi - - mulq @acc[10] # a[4]*a[2] - add %rax, @acc[6] - mov @acc[8], %rax - adc \$0, %rdx - add $hi, @acc[6] - adc \$0, %rdx - mov %rdx, $hi - - mulq @acc[11] # a[5]*a[2] - add %rax, @acc[7] - mov @acc[8], %rax - adc \$0, %rdx - add $hi, @acc[7] - adc \$0, %rdx - mov %rdx, 
@acc[8] - - mulq %rax # a[2]*a[2] - xor @acc[3], @acc[3] - add %rax, @acc[1] # can't carry - mov @acc[9], %rax - add @acc[4], @acc[4] # double acc[4:5] - adc @acc[5], @acc[5] - adc \$0, @acc[3] - add @acc[1], @acc[4] # accumulate a[2]*a[2] - adc %rdx, @acc[5] - adc \$0, @acc[3] # carries to a[3]*a[3] - mov @acc[4], 8*4($r_ptr) - - ######################################### - mulq @acc[10] # a[4]*a[3] - add %rax, @acc[7] - mov @acc[9], %rax - adc \$0, %rdx - mov @acc[5], 8*5($r_ptr) - mov %rdx, $hi - - mulq @acc[11] # a[5]*a[3] - add %rax, @acc[8] - mov @acc[9], %rax - adc \$0, %rdx - add $hi, @acc[8] - adc \$0, %rdx - mov %rdx, @acc[9] - - mulq %rax # a[3]*a[3] - xor @acc[4], @acc[4] - add %rax, @acc[3] # can't carry - mov @acc[10], %rax - add @acc[6], @acc[6] # double acc[6:7] - adc @acc[7], @acc[7] - adc \$0, @acc[4] - add @acc[3], @acc[6] # accumulate a[3]*a[3] - adc %rdx, @acc[7] - mov @acc[6], 8*6($r_ptr) - adc \$0, @acc[4] # carries to a[4]*a[4] - mov @acc[7], 8*7($r_ptr) - - ######################################### - mulq @acc[11] # a[5]*a[4] - add %rax, @acc[9] - mov @acc[10], %rax - adc \$0, %rdx - mov %rdx, @acc[10] - - mulq %rax # a[4]*a[4] - xor @acc[5], @acc[5] - add %rax, @acc[4] # can't carry - mov @acc[11], %rax - add @acc[8], @acc[8] # double acc[8:9] - adc @acc[9], @acc[9] - adc \$0, @acc[5] - add @acc[4], @acc[8] # accumulate a[4]*a[4] - adc %rdx, @acc[9] - mov @acc[8], 8*8($r_ptr) - adc \$0, @acc[5] # carries to a[5]*a[5] - mov @acc[9], 8*9($r_ptr) - - ######################################### - mulq %rax # a[5]*a[5] - add @acc[5], %rax # can't carry - add @acc[10], @acc[10] # double acc[10] - adc \$0, %rdx - add @acc[10], %rax # accumulate a[5]*a[5] - adc \$0, %rdx - mov %rax, 8*10($r_ptr) - mov %rdx, 8*11($r_ptr) - - ret -.size __sqrq_384,.-__sqrq_384 - -.globl sqr_mont_384 -.hidden sqr_mont_384 -.type sqr_mont_384,\@function,4,"unwind" -.align 32 -sqr_mont_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8*15, %rsp -.cfi_adjust_cfa_offset 8*15 -.cfi_end_prologue - - mov $n_ptr, 8*12(%rsp) # n0 - mov $b_org, 8*13(%rsp) # n_ptr - mov $r_ptr, 8*14(%rsp) - - mov %rsp, $r_ptr - call __sqrq_384 - - lea 0(%rsp), $a_ptr - mov 8*12(%rsp), %rcx # n0 for mul_by_1 - mov 8*13(%rsp), $b_ptr # n_ptr for mul_by_1 - mov 8*14(%rsp), $r_ptr - call __mulq_by_1_mont_384 - call __redc_tail_mont_384 - - lea 8*15(%rsp), %r8 # size optimization - mov 8*15(%rsp), %r15 -.cfi_restore %r15 - mov 8*1(%r8), %r14 -.cfi_restore %r14 - mov 8*2(%r8), %r13 -.cfi_restore %r13 - mov 8*3(%r8), %r12 -.cfi_restore %r12 - mov 8*4(%r8), %rbx -.cfi_restore %rbx - mov 8*5(%r8), %rbp -.cfi_restore %rbp - lea 8*6(%r8), %rsp -.cfi_adjust_cfa_offset -8*21 -.cfi_epilogue - ret -.cfi_endproc -.size sqr_mont_384,.-sqr_mont_384 -___ -} -{ ########################################################## 384-bit redc_mont -my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" - -$code.=<<___; -######################################################################## -# void redc_mont_384(uint64_t ret[6], const uint64_t a[12], -# uint64_t m[6], uint64_t n0); -.globl redc_mont_384 -.hidden redc_mont_384 -.type redc_mont_384,\@function,4,"unwind" -.align 32 -redc_mont_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp 
-.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov $b_org, $n_ptr - call __mulq_by_1_mont_384 - call __redc_tail_mont_384 - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size redc_mont_384,.-redc_mont_384 - -######################################################################## -# void from_mont_384(uint64_t ret[6], const uint64_t a[6], -# uint64_t m[6], uint64_t n0); -.globl from_mont_384 -.hidden from_mont_384 -.type from_mont_384,\@function,4,"unwind" -.align 32 -from_mont_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov $b_org, $n_ptr - call __mulq_by_1_mont_384 - - ################################# - # Branch-less conditional acc[0:6] - modulus - - #mov @acc[6], %rax # __mulq_by_1_mont_384 does it - mov @acc[7], %rcx - mov @acc[0], %rdx - mov @acc[1], %rbp - - sub 8*0($n_ptr), @acc[6] - sbb 8*1($n_ptr), @acc[7] - mov @acc[2], @acc[5] - sbb 8*2($n_ptr), @acc[0] - sbb 8*3($n_ptr), @acc[1] - sbb 8*4($n_ptr), @acc[2] - mov @acc[3], $a_ptr - sbb 8*5($n_ptr), @acc[3] - - cmovc %rax, @acc[6] - cmovc %rcx, @acc[7] - cmovc %rdx, @acc[0] - mov @acc[6], 8*0($r_ptr) - cmovc %rbp, @acc[1] - mov @acc[7], 8*1($r_ptr) - cmovc @acc[5], @acc[2] - mov @acc[0], 8*2($r_ptr) - cmovc $a_ptr, @acc[3] - mov @acc[1], 8*3($r_ptr) - mov @acc[2], 8*4($r_ptr) - mov @acc[3], 8*5($r_ptr) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size from_mont_384,.-from_mont_384 -___ -{ my @acc=@acc; # will be rotated locally - -$code.=<<___; -.type __mulq_by_1_mont_384,\@abi-omnipotent -.align 32 -__mulq_by_1_mont_384: - mov 8*0($a_ptr), %rax - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - - mov %rax, @acc[6] - imulq $n0, %rax - mov %rax, @acc[0] -___ -for (my $i=0; $i<6; $i++) { -my $hi = @acc[6]; -$code.=<<___; - ################################# reduction $i - mulq 8*0($n_ptr) - add %rax, @acc[6] # guaranteed to be zero - mov @acc[0], %rax - adc %rdx, @acc[6] - - mulq 8*1($n_ptr) - add %rax, @acc[1] - mov @acc[0], %rax - adc \$0, %rdx - add @acc[6], @acc[1] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*2($n_ptr) - add %rax, @acc[2] - mov @acc[0], %rax - adc \$0, %rdx - add $hi, @acc[2] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*3($n_ptr) - add %rax, @acc[3] - mov @acc[0], %rax - adc \$0, %rdx -___ -$code.=<<___ if ($i<5); - mov @acc[1], @acc[7] - imulq $n0, @acc[1] -___ -$code.=<<___; - add $hi, @acc[3] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*4($n_ptr) - add %rax, @acc[4] - mov @acc[0], %rax - adc \$0, %rdx - add $hi, @acc[4] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*5($n_ptr) - add %rax, @acc[5] - mov @acc[1], %rax - adc \$0, %rdx - add $hi, @acc[5] - adc \$0, %rdx - mov %rdx, @acc[6] -___ - push(@acc,shift(@acc)); -} -$code.=<<___; - ret -.size __mulq_by_1_mont_384,.-__mulq_by_1_mont_384 - 
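The pair of helpers above is the reference shape of the Montgomery reduction exported as redc_mont_384 (prototype quoted in the hunk): __mulq_by_1_mont_384 runs six word-level reduction steps over the low half of the input, and __redc_tail_mont_384 folds in the upper half and finishes with one branch-less conditional subtraction of the modulus. The same computation, restated as a plain-C sketch; the helper name, the use of unsigned __int128, and the precondition a < m * 2^384 are assumptions for illustration only, and the sketch is not constant-time.

    #include <stdint.h>

    typedef unsigned __int128 u128;   /* assumption: GCC/Clang 128-bit type */

    /* Sketch of redc_mont_384: ret = a * 2^-384 mod m, n0 = -m[0]^-1 mod 2^64.
     * Limbs are little-endian 64-bit words.  Assumes a < m * 2^384 so a single
     * conditional subtraction suffices. */
    static void redc_mont_384_sketch(uint64_t ret[6], const uint64_t a[12],
                                     const uint64_t m[6], uint64_t n0)
    {
        uint64_t acc[13] = {0};
        for (int i = 0; i < 12; i++) acc[i] = a[i];

        for (int i = 0; i < 6; i++) {           /* "reduction $i" above */
            uint64_t q = acc[i] * n0;           /* chosen so limb i becomes zero */
            uint64_t carry = 0;
            for (int j = 0; j < 6; j++) {       /* acc += q * m * 2^(64*i) */
                u128 t = (u128)q * m[j] + acc[i + j] + carry;
                acc[i + j] = (uint64_t)t;
                carry      = (uint64_t)(t >> 64);
            }
            for (int j = i + 6; carry && j < 13; j++) {  /* propagate carry */
                u128 t = (u128)acc[j] + carry;
                acc[j] = (uint64_t)t;
                carry  = (uint64_t)(t >> 64);
            }
        }

        /* acc[6..12] now holds a * 2^-384; conditional subtraction as in
         * __redc_tail_mont_384 */
        uint64_t tmp[6], borrow = 0;
        for (int j = 0; j < 6; j++) {
            u128 t = (u128)acc[6 + j] - m[j] - borrow;
            tmp[j] = (uint64_t)t;
            borrow = (uint64_t)(t >> 64) & 1;
        }
        int keep = borrow > acc[12];            /* underflow and no spare top bit */
        for (int j = 0; j < 6; j++)
            ret[j] = keep ? acc[6 + j] : tmp[j];
    }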
-.type __redc_tail_mont_384,\@abi-omnipotent -.align 32 -__redc_tail_mont_384: - add 8*6($a_ptr), @acc[0] # accumulate upper half - mov @acc[0], %rax - adc 8*7($a_ptr), @acc[1] - adc 8*8($a_ptr), @acc[2] - adc 8*9($a_ptr), @acc[3] - mov @acc[1], %rcx - adc 8*10($a_ptr), @acc[4] - adc 8*11($a_ptr), @acc[5] - sbb @acc[6], @acc[6] - - ################################# - # Branch-less conditional acc[0:6] - modulus - - mov @acc[2], %rdx - mov @acc[3], %rbp - - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - mov @acc[4], @acc[7] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - mov @acc[5], $a_ptr - sbb 8*5($n_ptr), @acc[5] - sbb \$0, @acc[6] - - cmovc %rax, @acc[0] - cmovc %rcx, @acc[1] - cmovc %rdx, @acc[2] - mov @acc[0], 8*0($r_ptr) - cmovc %rbp, @acc[3] - mov @acc[1], 8*1($r_ptr) - cmovc @acc[7], @acc[4] - mov @acc[2], 8*2($r_ptr) - cmovc $a_ptr, @acc[5] - mov @acc[3], 8*3($r_ptr) - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - ret -.size __redc_tail_mont_384,.-__redc_tail_mont_384 - -.globl sgn0_pty_mont_384 -.hidden sgn0_pty_mont_384 -.type sgn0_pty_mont_384,\@function,3,"unwind" -.align 32 -sgn0_pty_mont_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov $a_ptr, $n_ptr - lea 0($r_ptr), $a_ptr - mov $b_org, $n0 - call __mulq_by_1_mont_384 - - xor %rax, %rax - mov @acc[0], @acc[7] - add @acc[0], @acc[0] - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - adc @acc[3], @acc[3] - adc @acc[4], @acc[4] - adc @acc[5], @acc[5] - adc \$0, %rax - - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, %rax - - not %rax # 2*x > p, which means "negative" - and \$1, @acc[7] - and \$2, %rax - or @acc[7], %rax # pack sign and parity - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 - -.globl sgn0_pty_mont_384x -.hidden sgn0_pty_mont_384x -.type sgn0_pty_mont_384x,\@function,3,"unwind" -.align 32 -sgn0_pty_mont_384x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov $a_ptr, $n_ptr - lea 48($r_ptr), $a_ptr # sgn0(a->im) - mov $b_org, $n0 - call __mulq_by_1_mont_384 - - mov @acc[0], @acc[6] - or @acc[1], @acc[0] - or @acc[2], @acc[0] - or @acc[3], @acc[0] - or @acc[4], @acc[0] - or @acc[5], @acc[0] - - lea 0($r_ptr), $a_ptr # sgn0(a->re) - xor $r_ptr, $r_ptr - mov @acc[6], @acc[7] - add @acc[6], @acc[6] - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - adc @acc[3], @acc[3] - adc @acc[4], @acc[4] - adc @acc[5], @acc[5] - adc \$0, $r_ptr - - sub 8*0($n_ptr), @acc[6] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, $r_ptr - - mov @acc[0], 0(%rsp) # a->im is zero or not - not $r_ptr # 2*x > p, which means "negative" - and \$1, @acc[7] - and \$2, 
$r_ptr - or @acc[7], $r_ptr # pack sign and parity - - call __mulq_by_1_mont_384 - - mov @acc[0], @acc[6] - or @acc[1], @acc[0] - or @acc[2], @acc[0] - or @acc[3], @acc[0] - or @acc[4], @acc[0] - or @acc[5], @acc[0] - - xor %rax, %rax - mov @acc[6], @acc[7] - add @acc[6], @acc[6] - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - adc @acc[3], @acc[3] - adc @acc[4], @acc[4] - adc @acc[5], @acc[5] - adc \$0, %rax - - sub 8*0($n_ptr), @acc[6] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, %rax - - mov 0(%rsp), @acc[6] - - not %rax # 2*x > p, which means "negative" - - test @acc[0], @acc[0] - cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) - - test @acc[6], @acc[6] - cmovnz $r_ptr, %rax # a->im!=0? sgn0(a->im) : sgn0(a->re) - - and \$1, @acc[7] - and \$2, %rax - or @acc[7], %rax # pack sign and parity - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x -___ -} } - -{ ########################################################## mulq_mont -my ($bi, $hi) = ("%rdi", "%rbp"); - -$code.=<<___; -.globl mul_mont_384 -.hidden mul_mont_384 -.type mul_mont_384,\@function,5,"unwind" -.align 32 -mul_mont_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8*3, %rsp -.cfi_adjust_cfa_offset 8*3 -.cfi_end_prologue - - mov 8*0($b_org), %rax - mov 8*0($a_ptr), @acc[6] - mov 8*1($a_ptr), @acc[7] - mov 8*2($a_ptr), @acc[4] - mov 8*3($a_ptr), @acc[5] - mov $b_org, $b_ptr # evacuate from %rdx - mov $n0, 8*0(%rsp) - mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 - - call __mulq_mont_384 - - mov 24(%rsp),%r15 -.cfi_restore %r15 - mov 32(%rsp),%r14 -.cfi_restore %r14 - mov 40(%rsp),%r13 -.cfi_restore %r13 - mov 48(%rsp),%r12 -.cfi_restore %r12 - mov 56(%rsp),%rbx -.cfi_restore %rbx - mov 64(%rsp),%rbp -.cfi_restore %rbp - lea 72(%rsp),%rsp -.cfi_adjust_cfa_offset -72 -.cfi_epilogue - ret -.cfi_endproc -.size mul_mont_384,.-mul_mont_384 -___ -{ my @acc=@acc; # will be rotated locally - -$code.=<<___; -.type __mulq_mont_384,\@abi-omnipotent -.align 32 -__mulq_mont_384: - mov %rax, $bi - mulq @acc[6] # a[0]*b[0] - mov %rax, @acc[0] - mov $bi, %rax - mov %rdx, @acc[1] - - mulq @acc[7] # a[1]*b[0] - add %rax, @acc[1] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[2] - - mulq @acc[4] # a[2]*b[0] - add %rax, @acc[2] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[3] - - mov @acc[0], $hi - imulq 8(%rsp), @acc[0] - - mulq @acc[5] # a[3]*b[0] - add %rax, @acc[3] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[4] - - mulq 8*4($a_ptr) - add %rax, @acc[4] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[5] - - mulq 8*5($a_ptr) - add %rax, @acc[5] - mov @acc[0], %rax - adc \$0, %rdx - xor @acc[7], @acc[7] - mov %rdx, @acc[6] -___ -for (my $i=0; $i<6;) { -my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : @acc[1]; -$code.=<<___; - ################################# reduction $i - mulq 8*0($n_ptr) - add %rax, $hi # guaranteed to be zero - mov @acc[0], %rax - adc %rdx, $hi - - mulq 8*1($n_ptr) - add %rax, @acc[1] - mov @acc[0], %rax - adc \$0, %rdx - add $hi, @acc[1] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*2($n_ptr) - add %rax, @acc[2] - mov @acc[0], %rax - adc \$0, %rdx - add $hi, @acc[2] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*3($n_ptr) - add $hi, @acc[3] - adc \$0, %rdx - add %rax, @acc[3] - mov @acc[0], %rax - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*4($n_ptr) - add %rax, @acc[4] - mov @acc[0], %rax - adc \$0, %rdx - add $hi, @acc[4] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*5($n_ptr) - add %rax, @acc[5] - mov $b_next, %rax - adc \$0, %rdx - add $hi, @acc[5] - adc %rdx, @acc[6] - adc \$0, @acc[7] -___ - push(@acc,shift(@acc)); -$code.=<<___ if ($i++<5); - ################################# Multiply by b[$i] - mov %rax, $bi - mulq 8*0($a_ptr) - add %rax, @acc[0] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[7] - - mulq 8*1($a_ptr) - add %rax, @acc[1] - mov $bi, %rax - adc \$0, %rdx - add @acc[7], @acc[1] - adc \$0, %rdx - mov %rdx, @acc[7] - - mulq 8*2($a_ptr) - add %rax, @acc[2] - mov $bi, %rax - adc \$0, %rdx - add @acc[7], @acc[2] - adc \$0, %rdx - mov %rdx, @acc[7] - - mov @acc[0], $hi - imulq 8(%rsp), @acc[0] - - mulq 8*3($a_ptr) - add %rax, @acc[3] - mov $bi, %rax - adc \$0, %rdx - add @acc[7], @acc[3] - adc \$0, %rdx - mov %rdx, @acc[7] - - mulq 8*4($a_ptr) - add %rax, @acc[4] - mov $bi, %rax - adc \$0, %rdx - add @acc[7], @acc[4] - adc \$0, %rdx - mov %rdx, @acc[7] - - mulq 8*5($a_ptr) - add @acc[7], @acc[5] - adc \$0, %rdx - xor @acc[7], @acc[7] - add %rax, @acc[5] - mov @acc[0], %rax - adc %rdx, @acc[6] - adc \$0, @acc[7] -___ -} -$code.=<<___; - ################################# - # Branch-less conditional acc[0:6] - modulus - - #mov @acc[0], %rax - mov 8*2(%rsp), $r_ptr # restore $r_ptr - sub 8*0($n_ptr), @acc[0] - mov @acc[1], %rdx - sbb 8*1($n_ptr), @acc[1] - mov @acc[2], $b_ptr - sbb 8*2($n_ptr), @acc[2] - mov @acc[3], $a_ptr - sbb 8*3($n_ptr), @acc[3] - mov @acc[4], $hi - sbb 8*4($n_ptr), @acc[4] - mov @acc[5], @acc[7] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, @acc[6] - - cmovc %rax, @acc[0] - cmovc %rdx, @acc[1] - cmovc $b_ptr, @acc[2] - mov @acc[0], 8*0($r_ptr) - cmovc $a_ptr, @acc[3] - mov @acc[1], 8*1($r_ptr) - cmovc $hi, @acc[4] - mov @acc[2], 8*2($r_ptr) - cmovc @acc[7], @acc[5] - mov @acc[3], 8*3($r_ptr) - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - ret -.size __mulq_mont_384,.-__mulq_mont_384 -___ -} } -$code.=<<___; -.globl sqr_n_mul_mont_384 -.hidden sqr_n_mul_mont_384 -.type sqr_n_mul_mont_384,\@function,6,"unwind" -.align 32 -sqr_n_mul_mont_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8*17, %rsp -.cfi_adjust_cfa_offset 8*17 -.cfi_end_prologue - - mov $n0, 8*0(%rsp) - mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 - mov $n_ptr, 8*2(%rsp) - lea 8*4(%rsp), $r_ptr - mov %r9, 8*3(%rsp) # 6th, multiplicand argument - movq (%r9), %xmm2 # prefetch b[0] - -.Loop_sqr_384: - movd %edx, %xmm1 # loop counter - - call __sqrq_384 - - lea 0($r_ptr), $a_ptr - mov 8*0(%rsp), %rcx # n0 for mul_by_1 - mov 8*2(%rsp), $b_ptr # n_ptr for mul_by_1 - call __mulq_by_1_mont_384 - call __redc_tail_mont_384 - - movd %xmm1, %edx - lea 0($r_ptr), $a_ptr - dec %edx - jnz .Loop_sqr_384 - - movq %xmm2, %rax # 
b[0] - mov $b_ptr, $n_ptr - mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument - - #mov 8*0($b_ptr), %rax - #mov 8*0($a_ptr), @acc[6] - #mov 8*1($a_ptr), @acc[7] - #mov 8*2($a_ptr), @acc[4] - #mov 8*3($a_ptr), @acc[5] - mov @acc[0], @acc[4] - mov @acc[1], @acc[5] - - call __mulq_mont_384 - - lea 8*17(%rsp), %r8 # size optimization - mov 8*17(%rsp), %r15 -.cfi_restore %r15 - mov 8*1(%r8), %r14 -.cfi_restore %r14 - mov 8*2(%r8), %r13 -.cfi_restore %r13 - mov 8*3(%r8), %r12 -.cfi_restore %r12 - mov 8*4(%r8), %rbx -.cfi_restore %rbx - mov 8*5(%r8), %rbp -.cfi_restore %rbp - lea 8*6(%r8), %rsp -.cfi_adjust_cfa_offset -8*23 -.cfi_epilogue - ret -.cfi_endproc -.size sqr_n_mul_mont_384,.-sqr_n_mul_mont_384 - -.globl sqr_n_mul_mont_383 -.hidden sqr_n_mul_mont_383 -.type sqr_n_mul_mont_383,\@function,6,"unwind" -.align 32 -sqr_n_mul_mont_383: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8*17, %rsp -.cfi_adjust_cfa_offset 8*17 -.cfi_end_prologue - - mov $n0, 8*0(%rsp) - mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 - mov $n_ptr, 8*2(%rsp) - lea 8*4(%rsp), $r_ptr - mov %r9, 8*3(%rsp) # 6th, multiplicand argument - movq (%r9), %xmm2 # prefetch b[0] - -.Loop_sqr_383: - movd %edx, %xmm1 # loop counter - - call __sqrq_384 - - lea 0($r_ptr), $a_ptr - mov 8*0(%rsp), %rcx # n0 for mul_by_1 - mov 8*2(%rsp), $b_ptr # n_ptr for mul_by_1 - call __mulq_by_1_mont_384 - - movd %xmm1, %edx # loop counter - add 8*6($a_ptr), @acc[6] # just accumulate upper half - adc 8*7($a_ptr), @acc[7] - adc 8*8($a_ptr), @acc[0] - adc 8*9($a_ptr), @acc[1] - adc 8*10($a_ptr), @acc[2] - adc 8*11($a_ptr), @acc[3] - lea 0($r_ptr), $a_ptr - - mov @acc[6], 8*0($r_ptr) # omitting full reduction gives ~5% - mov @acc[7], 8*1($r_ptr) # in addition-chains - mov @acc[0], 8*2($r_ptr) - mov @acc[1], 8*3($r_ptr) - mov @acc[2], 8*4($r_ptr) - mov @acc[3], 8*5($r_ptr) - - dec %edx - jnz .Loop_sqr_383 - - movq %xmm2, %rax # b[0] - mov $b_ptr, $n_ptr - mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument - - #movq 8*0($b_ptr), %rax - #mov 8*0($a_ptr), @acc[6] - #mov 8*1($a_ptr), @acc[7] - #mov 8*2($a_ptr), @acc[4] - #mov 8*3($a_ptr), @acc[5] - mov @acc[0], @acc[4] - mov @acc[1], @acc[5] - - call __mulq_mont_384 # formally one can omit full reduction - # even after multiplication... 
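sqr_n_mul_mont_384 and sqr_n_mul_mont_383 both compute ret = a^(2^count) * b in Montgomery form, which is exactly the step an exponentiation addition chain needs. Per the comments above, the _383 variant omits the final conditional subtraction after each squaring, relying on the modulus being short enough that a not-fully-reduced value still fits in six limbs, which is quoted as worth roughly 5% in addition chains. A hedged usage sketch follows: the C prototype is inferred from the register handling in the hunk, and pow_chain_demo is a made-up caller, not blst code.

    #include <stddef.h>
    #include <stdint.h>

    typedef uint64_t vec384[6];

    /* Assumed prototype, inferred from the assembly's argument handling:
     * ret = a^(2^count) * b, all operands in Montgomery form mod p. */
    void sqr_n_mul_mont_383(vec384 ret, const vec384 a, size_t count,
                            const vec384 p, uint64_t n0, const vec384 b);

    /* Hypothetical fragment of an addition chain: each call squares `count`
     * times and multiplies once, so a chain is just a list of
     * (count, multiplier) pairs. */
    static void pow_chain_demo(vec384 ret, const vec384 x,
                               const vec384 p, uint64_t n0)
    {
        vec384 x3, x13, t;

        sqr_n_mul_mont_383(x3,  x,   1, p, n0, x);    /* x^2   * x   = x^3   */
        sqr_n_mul_mont_383(x13, x3,  2, p, n0, x);    /* x^12  * x   = x^13  */
        sqr_n_mul_mont_383(t,   x13, 4, p, n0, x3);   /* x^208 * x^3 = x^211 */
        /* real chains (e.g. inversion by Fermat) continue in this pattern */
        for (int i = 0; i < 6; i++) ret[i] = t[i];
    }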
- lea 8*17(%rsp), %r8 # size optimization - mov 8*17(%rsp), %r15 -.cfi_restore %r15 - mov 8*1(%r8), %r14 -.cfi_restore %r14 - mov 8*2(%r8), %r13 -.cfi_restore %r13 - mov 8*3(%r8), %r12 -.cfi_restore %r12 - mov 8*4(%r8), %rbx -.cfi_restore %rbx - mov 8*5(%r8), %rbp -.cfi_restore %rbp - lea 8*6(%r8), %rsp -.cfi_adjust_cfa_offset -8*23 -.cfi_epilogue - ret -.cfi_endproc -.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 -___ -{ my @acc=@acc; # will be rotated locally - my $bi = "%rbp"; - -$code.=<<___; -.type __mulq_mont_383_nonred,\@abi-omnipotent -.align 32 -__mulq_mont_383_nonred: - mov %rax, $bi - mulq @acc[6] # a[0]*b[0] - mov %rax, @acc[0] - mov $bi, %rax - mov %rdx, @acc[1] - - mulq @acc[7] # a[1]*b[0] - add %rax, @acc[1] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[2] - - mulq @acc[4] # a[2]*b[0] - add %rax, @acc[2] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[3] - - mov @acc[0], @acc[7] - imulq 8(%rsp), @acc[0] - - mulq @acc[5] # a[3]*b[0] - add %rax, @acc[3] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[4] - - mulq 8*4($a_ptr) - add %rax, @acc[4] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[5] - - mulq 8*5($a_ptr) - add %rax, @acc[5] - mov @acc[0], %rax - adc \$0, %rdx - mov %rdx, @acc[6] -___ -for (my $i=0; $i<6;) { -my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : @acc[1]; -$code.=<<___; - ################################# reduction $i - mulq 8*0($n_ptr) - add %rax, @acc[7] # guaranteed to be zero - mov @acc[0], %rax - adc %rdx, @acc[7] - - mulq 8*1($n_ptr) - add %rax, @acc[1] - mov @acc[0], %rax - adc \$0, %rdx - add @acc[7], @acc[1] - adc \$0, %rdx - mov %rdx, @acc[7] - - mulq 8*2($n_ptr) - add %rax, @acc[2] - mov @acc[0], %rax - adc \$0, %rdx - add @acc[7], @acc[2] - adc \$0, %rdx - mov %rdx, @acc[7] - - mulq 8*3($n_ptr) - add @acc[7], @acc[3] - adc \$0, %rdx - add %rax, @acc[3] - mov @acc[0], %rax - adc \$0, %rdx - mov %rdx, @acc[7] - - mulq 8*4($n_ptr) - add %rax, @acc[4] - mov @acc[0], %rax - adc \$0, %rdx - add @acc[7], @acc[4] - adc \$0, %rdx - mov %rdx, @acc[7] - - mulq 8*5($n_ptr) - add %rax, @acc[5] - mov $b_next, %rax - adc \$0, %rdx - add @acc[7], @acc[5] - adc %rdx, @acc[6] -___ - push(@acc,shift(@acc)); -$code.=<<___ if ($i++<5); - ################################# Multiply by b[$i] - mov %rax, $bi - mulq 8*0($a_ptr) - add %rax, @acc[0] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[6] - - mulq 8*1($a_ptr) - add %rax, @acc[1] - mov $bi, %rax - adc \$0, %rdx - add @acc[6], @acc[1] - adc \$0, %rdx - mov %rdx, @acc[6] - - mulq 8*2($a_ptr) - add %rax, @acc[2] - mov $bi, %rax - adc \$0, %rdx - add @acc[6], @acc[2] - adc \$0, %rdx - mov %rdx, @acc[6] - - mov @acc[0], @acc[7] - imulq 8(%rsp), @acc[0] - - mulq 8*3($a_ptr) - add %rax, @acc[3] - mov $bi, %rax - adc \$0, %rdx - add @acc[6], @acc[3] - adc \$0, %rdx - mov %rdx, @acc[6] - - mulq 8*4($a_ptr) - add %rax, @acc[4] - mov $bi, %rax - adc \$0, %rdx - add @acc[6], @acc[4] - adc \$0, %rdx - mov %rdx, @acc[6] - - mulq 8*5($a_ptr) - add @acc[6], @acc[5] - adc \$0, %rdx - add %rax, @acc[5] - mov @acc[0], %rax - adc \$0, %rdx - mov %rdx, @acc[6] -___ -} -$code.=<<___; - ret -.size __mulq_mont_383_nonred,.-__mulq_mont_383_nonred -___ -} -{ my $frame = 4*8 + # place for argument off-load + - 2*384/8 + # place for 2 384-bit temporary vectors - 8; # align -my @acc = (@acc,"%rax","%rdx","%rbx","%rbp"); - -# omitting 3 reductions gives 8-11% better performance in add-chains -$code.=<<___; -.globl sqr_mont_382x -.hidden sqr_mont_382x -.type sqr_mont_382x,\@function,4,"unwind" -.align 32 -sqr_mont_382x: -.cfi_startproc - push 
%rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$$frame, %rsp -.cfi_adjust_cfa_offset $frame -.cfi_end_prologue - - mov $n_ptr, 8*0(%rsp) # n0 - mov $b_org, $n_ptr # n_ptr - mov $a_ptr, 8*2(%rsp) - mov $r_ptr, 8*3(%rsp) - - ################################# - mov 8*0($a_ptr), @acc[0] # a->re - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - - mov @acc[0], @acc[6] - add 8*6($a_ptr), @acc[0] # a->re + a->im - mov @acc[1], @acc[7] - adc 8*7($a_ptr), @acc[1] - mov @acc[2], @acc[8] - adc 8*8($a_ptr), @acc[2] - mov @acc[3], @acc[9] - adc 8*9($a_ptr), @acc[3] - mov @acc[4], @acc[10] - adc 8*10($a_ptr), @acc[4] - mov @acc[5], @acc[11] - adc 8*11($a_ptr), @acc[5] - - sub 8*6($a_ptr), @acc[6] # a->re - a->im - sbb 8*7($a_ptr), @acc[7] - sbb 8*8($a_ptr), @acc[8] - sbb 8*9($a_ptr), @acc[9] - sbb 8*10($a_ptr), @acc[10] - sbb 8*11($a_ptr), @acc[11] - sbb $r_ptr, $r_ptr # borrow flag as mask - - mov @acc[0], 32+8*0(%rsp) # t0 - mov @acc[1], 32+8*1(%rsp) - mov @acc[2], 32+8*2(%rsp) - mov @acc[3], 32+8*3(%rsp) - mov @acc[4], 32+8*4(%rsp) - mov @acc[5], 32+8*5(%rsp) - - mov @acc[6], 32+8*6(%rsp) # t1 - mov @acc[7], 32+8*7(%rsp) - mov @acc[8], 32+8*8(%rsp) - mov @acc[9], 32+8*9(%rsp) - mov @acc[10], 32+8*10(%rsp) - mov @acc[11], 32+8*11(%rsp) - mov $r_ptr, 32+8*12(%rsp) - - ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); - #mov 8*2(%rsp), $a_ptr # a->re - lea 48($a_ptr), $b_ptr # a->im - - mov 48($a_ptr), %rax # a->im - mov 8*0($a_ptr), @acc[6] # a->re - mov 8*1($a_ptr), @acc[7] - mov 8*2($a_ptr), @acc[4] - mov 8*3($a_ptr), @acc[5] - - mov 8*3(%rsp), $r_ptr - call __mulq_mont_383_nonred -___ -{ -my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 - 12,13,"ax","bx","bp","si"); -$code.=<<___; - add @acc[0], @acc[0] # add with itself - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - adc @acc[3], @acc[3] - adc @acc[4], @acc[4] - adc @acc[5], @acc[5] - - mov @acc[0], 8*6($r_ptr) # ret->im - mov @acc[1], 8*7($r_ptr) - mov @acc[2], 8*8($r_ptr) - mov @acc[3], 8*9($r_ptr) - mov @acc[4], 8*10($r_ptr) - mov @acc[5], 8*11($r_ptr) -___ -} -$code.=<<___; - ################################# mul_mont_384(ret->re, t0, t1, mod, n0); - lea 32(%rsp), $a_ptr # t0 - lea 32+8*6(%rsp), $b_ptr # t1 - - mov 32+8*6(%rsp), %rax # t1[0] - mov 32+8*0(%rsp), @acc[6] # t0[0..3] - mov 32+8*1(%rsp), @acc[7] - mov 32+8*2(%rsp), @acc[4] - mov 32+8*3(%rsp), @acc[5] - - call __mulq_mont_383_nonred -___ -{ -my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 - 12,13,"ax","bx","bp","si"); -$code.=<<___; - mov 32+8*12(%rsp), @acc[11] # account for sign from a->re - a->im - mov 32+8*0(%rsp), @acc[6] - mov 32+8*1(%rsp), @acc[7] - and @acc[11], @acc[6] - mov 32+8*2(%rsp), @acc[8] - and @acc[11], @acc[7] - mov 32+8*3(%rsp), @acc[9] - and @acc[11], @acc[8] - mov 32+8*4(%rsp), @acc[10] - and @acc[11], @acc[9] - and @acc[11], @acc[10] - and 32+8*5(%rsp), @acc[11] - - sub @acc[6], @acc[0] - mov 8*0($n_ptr), @acc[6] - sbb @acc[7], @acc[1] - mov 8*1($n_ptr), @acc[7] - sbb @acc[8], @acc[2] - mov 8*2($n_ptr), @acc[8] - sbb @acc[9], @acc[3] - mov 8*3($n_ptr), @acc[9] - sbb @acc[10], @acc[4] - mov 8*4($n_ptr), @acc[10] - sbb @acc[11], @acc[5] - sbb @acc[11], @acc[11] - - and @acc[11], @acc[6] - and @acc[11], @acc[7] - and @acc[11], @acc[8] - and @acc[11], @acc[9] - and @acc[11], @acc[10] - and 8*5($n_ptr), 
@acc[11] - - add @acc[6], @acc[0] - adc @acc[7], @acc[1] - adc @acc[8], @acc[2] - adc @acc[9], @acc[3] - adc @acc[10], @acc[4] - adc @acc[11], @acc[5] - - mov @acc[0], 8*0($r_ptr) # ret->re - mov @acc[1], 8*1($r_ptr) - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) -___ -} -$code.=<<___; - lea $frame(%rsp), %r8 # size optimization - mov 8*0(%r8),%r15 -.cfi_restore %r15 - mov 8*1(%r8),%r14 -.cfi_restore %r14 - mov 8*2(%r8),%r13 -.cfi_restore %r13 - mov 8*3(%r8),%r12 -.cfi_restore %r12 - mov 8*4(%r8),%rbx -.cfi_restore %rbx - mov 8*5(%r8),%rbp -.cfi_restore %rbp - lea 8*6(%r8),%rsp -.cfi_adjust_cfa_offset -$frame-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size sqr_mont_382x,.-sqr_mont_382x -___ -} - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/mulx_mont_256-x86_64.pl b/crypto/blst_src/asm/mulx_mont_256-x86_64.pl deleted file mode 100755 index 0d6bf2e465c..00000000000 --- a/crypto/blst_src/asm/mulx_mont_256-x86_64.pl +++ /dev/null @@ -1,486 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# "Sparse" in subroutine names refers to most significant limb of the -# modulus. Though "sparse" is a bit of misnomer, because limitation is -# just not-all-ones. Or in other words not larger than 2^256-2^192-1. -# In general Montgomery multiplication algorithm can handle one of the -# inputs being non-reduced and capped by 1<re, b->re); - #lea 0($b_btr), $b_ptr # b->re - #lea 0($a_ptr), $a_ptr # a->re - lea 40(%rsp), $r_ptr # t0 - call __mulx_384 - - ################################# mul_384(t1, a->im, b->im); - lea 48($b_ptr), $b_ptr # b->im - lea 128+48($a_ptr), $a_ptr # a->im - lea 96($r_ptr), $r_ptr # t1 - call __mulx_384 - - ################################# mul_384(t2, a->re+a->im, b->re+b->im); - mov 8*1(%rsp), $n_ptr - lea ($b_ptr), $a_ptr # b->re - lea -48($b_ptr), $b_org # b->im - lea 40+192+48(%rsp), $r_ptr - call __add_mod_384 - - mov 8*3(%rsp), $a_ptr # a->re - lea 48($a_ptr), $b_org # a->im - lea -48($r_ptr), $r_ptr - call __add_mod_384 - - lea ($r_ptr),$b_ptr - lea 48($r_ptr),$a_ptr - call __mulx_384 - - ################################# t2=t2-t0-t1 - lea ($r_ptr), $a_ptr # t2 - lea 40(%rsp), $b_org # t0 - mov 8*1(%rsp), $n_ptr - call __sub_mod_384x384 # t2-t0 - - lea ($r_ptr), $a_ptr # t2 - lea -96($r_ptr), $b_org # t1 - call __sub_mod_384x384 # t2-t0-t1 - - ################################# t0=t0-t1 - lea 40(%rsp), $a_ptr - lea 40+96(%rsp), $b_org - lea 40(%rsp), $r_ptr - call __sub_mod_384x384 # t0-t1 - - lea ($n_ptr), $b_ptr # n_ptr for redc_mont_384 - - ################################# redc_mont_384(ret->re, t0, mod, n0); - lea 40(%rsp), $a_ptr # t0 - mov 8*0(%rsp), %rcx # n0 for redc_mont_384 - mov 8*4(%rsp), $r_ptr # ret->re - call __mulx_by_1_mont_384 - call __redc_tail_mont_384 - - ################################# redc_mont_384(ret->im, t2, mod, n0); - lea 40+192(%rsp), $a_ptr # t2 - mov 8*0(%rsp), %rcx # n0 for redc_mont_384 - lea 48($r_ptr), $r_ptr # ret->im - call __mulx_by_1_mont_384 - call __redc_tail_mont_384 - - lea $frame(%rsp), %r8 # size optimization - mov 8*0(%r8),%r15 -.cfi_restore %r15 - mov 8*1(%r8),%r14 -.cfi_restore %r14 - mov 8*2(%r8),%r13 -.cfi_restore %r13 - mov 8*3(%r8),%r12 -.cfi_restore %r12 - mov 8*4(%r8),%rbx -.cfi_restore %rbx - mov 8*5(%r8),%rbp -.cfi_restore %rbp - lea 8*6(%r8),%rsp -.cfi_adjust_cfa_offset -$frame-8*6 -.cfi_epilogue - ret 
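mulx_mont_384x above is Karatsuba over the quadratic extension Fp[u]/(u^2 + 1): three double-width products (a0*b0, a1*b1 and (a0+a1)(b0+b1)), double-width subtractions to form a0*b0 - a1*b1 and a0*b1 + a1*b0, and only then two Montgomery reductions. The same flow restated as a C sketch, with helper names mirroring the comments in the hunk; the exact prototypes are assumptions.

    #include <stdint.h>

    typedef uint64_t vec384[6];     /* one Fp element            */
    typedef uint64_t vec768[12];    /* one double-width product  */

    /* Assumed helper prototypes, matching the comments in the hunk. */
    void mul_384(vec768 ret, const vec384 a, const vec384 b);
    void add_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p);
    void sub_mod_384x384(vec768 ret, const vec768 a, const vec768 b, const vec384 p);
    void redc_mont_384(vec384 ret, const vec768 a, const vec384 p, uint64_t n0);

    /* ret = a * b in Fp2 = Fp[u]/(u^2 + 1), with a = a[0] + a[1]*u. */
    static void mul_mont_384x_sketch(vec384 ret[2], const vec384 a[2],
                                     const vec384 b[2], const vec384 p,
                                     uint64_t n0)
    {
        vec768 t0, t1, t2;
        vec384 aa, bb;

        mul_384(t0, a[0], b[0]);            /* t0 = a0*b0                   */
        mul_384(t1, a[1], b[1]);            /* t1 = a1*b1                   */
        add_mod_384(aa, a[0], a[1], p);     /* aa = a0 + a1                 */
        add_mod_384(bb, b[0], b[1], p);     /* bb = b0 + b1                 */
        mul_384(t2, aa, bb);                /* t2 = (a0+a1)*(b0+b1)         */

        sub_mod_384x384(t2, t2, t0, p);     /* t2 -= t0                     */
        sub_mod_384x384(t2, t2, t1, p);     /* t2 -= t1  -> a0*b1 + a1*b0   */
        sub_mod_384x384(t0, t0, t1, p);     /* t0 -= t1  -> a0*b0 - a1*b1   */

        redc_mont_384(ret[0], t0, p, n0);   /* ret->re                      */
        redc_mont_384(ret[1], t2, p, n0);   /* ret->im                      */
    }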
-.cfi_endproc -.size mulx_mont_384x,.-mulx_mont_384x -___ -} -{ my $frame = 4*8 + # place for argument off-load + - 2*384/8 + # place for 2 384-bit temporary vectors - 8; # alignment -$code.=<<___; -.globl sqrx_mont_384x -.hidden sqrx_mont_384x -.type sqrx_mont_384x,\@function,4,"unwind" -.align 32 -sqrx_mont_384x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$$frame, %rsp -.cfi_adjust_cfa_offset $frame -.cfi_end_prologue - - mov $n_ptr, 8*0(%rsp) # n0 - mov $b_org, $n_ptr # n_ptr - # gap for __mulx_mont_384 - mov $r_ptr, 8*2(%rsp) - mov $a_ptr, 8*3(%rsp) - - ################################# add_mod_384(t0, a->re, a->im); - lea 48($a_ptr), $b_org # a->im - lea 32(%rsp), $r_ptr # t0 - call __add_mod_384 - - ################################# sub_mod_384(t1, a->re, a->im); - mov 8*3(%rsp), $a_ptr # a->re - lea 48($a_ptr), $b_org # a->im - lea 32+48(%rsp), $r_ptr # t1 - call __sub_mod_384 - - ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); - mov 8*3(%rsp), $a_ptr # a->re - lea 48($a_ptr), $b_ptr # a->im - - mov 48($a_ptr), %rdx - mov 8*0($a_ptr), %r14 # @acc[6] - mov 8*1($a_ptr), %r15 # @acc[7] - mov 8*2($a_ptr), %rax # @acc[8] - mov 8*3($a_ptr), %r12 # @acc[4] - mov 8*4($a_ptr), %rdi # $lo - mov 8*5($a_ptr), %rbp # $hi - lea -128($a_ptr), $a_ptr # control u-op density - lea -128($n_ptr), $n_ptr # control u-op density - - mulx %r14, %r8, %r9 - call __mulx_mont_384 -___ -{ -my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 - 8..11,13,14); -$code.=<<___; - add @acc[0], @acc[0] # add with itself - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - mov @acc[0], @acc[6] - adc @acc[3], @acc[3] - mov @acc[1], @acc[7] - adc @acc[4], @acc[4] - mov @acc[2], @acc[8] - adc @acc[5], @acc[5] - mov @acc[3], @acc[9] - sbb $a_ptr, $a_ptr - - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - mov @acc[4], @acc[10] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - mov @acc[5], @acc[11] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, $a_ptr - - cmovc @acc[6], @acc[0] - cmovc @acc[7], @acc[1] - cmovc @acc[8], @acc[2] - mov @acc[0], 8*6($b_ptr) # ret->im - cmovc @acc[9], @acc[3] - mov @acc[1], 8*7($b_ptr) - cmovc @acc[10], @acc[4] - mov @acc[2], 8*8($b_ptr) - cmovc @acc[11], @acc[5] - mov @acc[3], 8*9($b_ptr) - mov @acc[4], 8*10($b_ptr) - mov @acc[5], 8*11($b_ptr) -___ -} -$code.=<<___; - ################################# mul_mont_384(ret->re, t0, t1, mod, n0); - lea 32(%rsp), $a_ptr # t0 - lea 32+48(%rsp), $b_ptr # t1 - - mov 32+48(%rsp), %rdx # t1[0] - mov 32+8*0(%rsp), %r14 # @acc[6] - mov 32+8*1(%rsp), %r15 # @acc[7] - mov 32+8*2(%rsp), %rax # @acc[8] - mov 32+8*3(%rsp), %r12 # @acc[4] - mov 32+8*4(%rsp), %rdi # $lo - mov 32+8*5(%rsp), %rbp # $hi - lea -128($a_ptr), $a_ptr # control u-op density - lea -128($n_ptr), $n_ptr # control u-op density - - mulx %r14, %r8, %r9 - call __mulx_mont_384 - - lea $frame(%rsp), %r8 # size optimization - mov 8*0(%r8),%r15 -.cfi_restore %r15 - mov 8*1(%r8),%r14 -.cfi_restore %r14 - mov 8*2(%r8),%r13 -.cfi_restore %r13 - mov 8*3(%r8),%r12 -.cfi_restore %r12 - mov 8*4(%r8),%rbx -.cfi_restore %rbx - mov 8*5(%r8),%rbp -.cfi_restore %rbp - lea 8*6(%r8),%rsp -.cfi_adjust_cfa_offset -$frame-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size sqrx_mont_384x,.-sqrx_mont_384x - -.globl mulx_382x -.hidden mulx_382x -.type mulx_382x,\@function,4,"unwind" -.align 32 
-mulx_382x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$$frame, %rsp -.cfi_adjust_cfa_offset $frame -.cfi_end_prologue - - lea 96($r_ptr), $r_ptr # ret->im - mov $a_ptr, 8*0(%rsp) - mov $b_org, 8*1(%rsp) - mov $r_ptr, 8*2(%rsp) # offload ret->im - mov $n_ptr, 8*3(%rsp) - - ################################# t0 = a->re + a->im - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - - add 8*6($a_ptr), @acc[0] - adc 8*7($a_ptr), @acc[1] - adc 8*8($a_ptr), @acc[2] - adc 8*9($a_ptr), @acc[3] - adc 8*10($a_ptr), @acc[4] - adc 8*11($a_ptr), @acc[5] - - mov @acc[0], 32+8*0(%rsp) - mov @acc[1], 32+8*1(%rsp) - mov @acc[2], 32+8*2(%rsp) - mov @acc[3], 32+8*3(%rsp) - mov @acc[4], 32+8*4(%rsp) - mov @acc[5], 32+8*5(%rsp) - - ################################# t1 = b->re + b->im - mov 8*0($b_org), @acc[0] - mov 8*1($b_org), @acc[1] - mov 8*2($b_org), @acc[2] - mov 8*3($b_org), @acc[3] - mov 8*4($b_org), @acc[4] - mov 8*5($b_org), @acc[5] - - add 8*6($b_org), @acc[0] - adc 8*7($b_org), @acc[1] - adc 8*8($b_org), @acc[2] - adc 8*9($b_org), @acc[3] - adc 8*10($b_org), @acc[4] - adc 8*11($b_org), @acc[5] - - mov @acc[0], 32+8*6(%rsp) - mov @acc[1], 32+8*7(%rsp) - mov @acc[2], 32+8*8(%rsp) - mov @acc[3], 32+8*9(%rsp) - mov @acc[4], 32+8*10(%rsp) - mov @acc[5], 32+8*11(%rsp) - - ################################# mul_384(ret->im, t0, t1); - lea 32+8*0(%rsp), $a_ptr # t0 - lea 32+8*6(%rsp), $b_ptr # t1 - call __mulx_384 - - ################################# mul_384(ret->re, a->re, b->re); - mov 8*0(%rsp), $a_ptr - mov 8*1(%rsp), $b_ptr - lea -96($r_ptr), $r_ptr # ret->re - call __mulx_384 - - ################################# mul_384(tx, a->im, b->im); - lea 48+128($a_ptr), $a_ptr - lea 48($b_ptr), $b_ptr - lea 32(%rsp), $r_ptr - call __mulx_384 - - ################################# ret->im -= tx - mov 8*2(%rsp), $a_ptr # restore ret->im - lea 32(%rsp), $b_org - mov 8*3(%rsp), $n_ptr - mov $a_ptr, $r_ptr - call __sub_mod_384x384 - - ################################# ret->im -= ret->re - lea 0($r_ptr), $a_ptr - lea -96($r_ptr), $b_org - call __sub_mod_384x384 - - ################################# ret->re -= tx - lea -96($r_ptr), $a_ptr - lea 32(%rsp), $b_org - lea -96($r_ptr), $r_ptr - call __sub_mod_384x384 - - lea $frame(%rsp), %r8 # size optimization - mov 8*0(%r8),%r15 -.cfi_restore %r15 - mov 8*1(%r8),%r14 -.cfi_restore %r14 - mov 8*2(%r8),%r13 -.cfi_restore %r13 - mov 8*3(%r8),%r12 -.cfi_restore %r12 - mov 8*4(%r8),%rbx -.cfi_restore %rbx - mov 8*5(%r8),%rbp -.cfi_restore %rbp - lea 8*6(%r8),%rsp -.cfi_adjust_cfa_offset -$frame-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size mulx_382x,.-mulx_382x -___ -} -{ my @acc=(@acc,"%rax","%rbx","%rbp",$b_org); # all registers are affected - # except for $n_ptr and $r_ptr -$code.=<<___; -.globl sqrx_382x -.hidden sqrx_382x -.type sqrx_382x,\@function,3,"unwind" -.align 32 -sqrx_382x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $a_ptr -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov $b_org, $n_ptr - - ################################# t0 = a->re + a->im - mov 8*0($a_ptr), @acc[6] - mov 8*1($a_ptr), @acc[7] - mov 8*2($a_ptr), @acc[8] - mov 8*3($a_ptr), @acc[9] - mov 
8*4($a_ptr), @acc[10] - mov 8*5($a_ptr), @acc[11] - - mov @acc[6], @acc[0] - add 8*6($a_ptr), @acc[6] - mov @acc[7], @acc[1] - adc 8*7($a_ptr), @acc[7] - mov @acc[8], @acc[2] - adc 8*8($a_ptr), @acc[8] - mov @acc[9], @acc[3] - adc 8*9($a_ptr), @acc[9] - mov @acc[10], @acc[4] - adc 8*10($a_ptr), @acc[10] - mov @acc[11], @acc[5] - adc 8*11($a_ptr), @acc[11] - - mov @acc[6], 8*0($r_ptr) - mov @acc[7], 8*1($r_ptr) - mov @acc[8], 8*2($r_ptr) - mov @acc[9], 8*3($r_ptr) - mov @acc[10], 8*4($r_ptr) - mov @acc[11], 8*5($r_ptr) - - ################################# t1 = a->re - a->im - lea 48($a_ptr), $b_org - lea 48($r_ptr), $r_ptr - call __sub_mod_384_a_is_loaded - - ################################# mul_384(ret->re, t0, t1); - lea ($r_ptr), $a_ptr - lea -48($r_ptr), $b_ptr - lea -48($r_ptr), $r_ptr - call __mulx_384 - - ################################# mul_384(ret->im, a->re, a->im); - mov (%rsp), $a_ptr - lea 48($a_ptr), $b_ptr - lea 96($r_ptr), $r_ptr - call __mulx_384 - - mov 8*0($r_ptr), @acc[0] # double ret->im - mov 8*1($r_ptr), @acc[1] - mov 8*2($r_ptr), @acc[2] - mov 8*3($r_ptr), @acc[3] - mov 8*4($r_ptr), @acc[4] - mov 8*5($r_ptr), @acc[5] - mov 8*6($r_ptr), @acc[6] - mov 8*7($r_ptr), @acc[7] - mov 8*8($r_ptr), @acc[8] - mov 8*9($r_ptr), @acc[9] - mov 8*10($r_ptr), @acc[10] - add @acc[0], @acc[0] - mov 8*11($r_ptr), @acc[11] - adc @acc[1], @acc[1] - mov @acc[0], 8*0($r_ptr) - adc @acc[2], @acc[2] - mov @acc[1], 8*1($r_ptr) - adc @acc[3], @acc[3] - mov @acc[2], 8*2($r_ptr) - adc @acc[4], @acc[4] - mov @acc[3], 8*3($r_ptr) - adc @acc[5], @acc[5] - mov @acc[4], 8*4($r_ptr) - adc @acc[6], @acc[6] - mov @acc[5], 8*5($r_ptr) - adc @acc[7], @acc[7] - mov @acc[6], 8*6($r_ptr) - adc @acc[8], @acc[8] - mov @acc[7], 8*7($r_ptr) - adc @acc[9], @acc[9] - mov @acc[8], 8*8($r_ptr) - adc @acc[10], @acc[10] - mov @acc[9], 8*9($r_ptr) - adc @acc[11], @acc[11] - mov @acc[10], 8*10($r_ptr) - mov @acc[11], 8*11($r_ptr) - - mov 8*1(%rsp),%r15 -.cfi_restore %r15 - mov 8*2(%rsp),%r14 -.cfi_restore %r14 - mov 8*3(%rsp),%r13 -.cfi_restore %r13 - mov 8*4(%rsp),%r12 -.cfi_restore %r12 - mov 8*5(%rsp),%rbx -.cfi_restore %rbx - mov 8*6(%rsp),%rbp -.cfi_restore %rbp - lea 8*7(%rsp),%rsp -.cfi_adjust_cfa_offset -8*7 -.cfi_epilogue - ret -.cfi_endproc -.size sqrx_382x,.-sqrx_382x -___ -} -{ ########################################################## 384-bit mulx -my ($a0, $a1) = @acc[6..7]; -my @acc = @acc[0..5]; -my ($lo, $hi, $zr) = ("%rax", "%rcx", "%rbp"); - -$code.=<<___; -.globl mulx_384 -.hidden mulx_384 -.type mulx_384,\@function,3,"unwind" -.align 32 -mulx_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 -.cfi_end_prologue - - mov $b_org, $b_ptr # evacuate from %rdx - call __mulx_384 - - mov 0(%rsp),%r15 -.cfi_restore %r15 - mov 8(%rsp),%r14 -.cfi_restore %r14 - mov 16(%rsp),%r13 -.cfi_restore %r13 - mov 24(%rsp),%r12 -.cfi_restore %r12 - mov 32(%rsp),%rbx -.cfi_restore %rbx - mov 40(%rsp),%rbp -.cfi_restore %rbp - lea 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.cfi_epilogue - ret -.cfi_endproc -.size mulx_384,.-mulx_384 - -.type __mulx_384,\@abi-omnipotent -.align 32 -__mulx_384: - mov 8*0($b_ptr), %rdx - mov 8*0($a_ptr), $a0 - mov 8*1($a_ptr), $a1 - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - lea -128($a_ptr), $a_ptr - - mulx $a0, @acc[1], $hi - xor $zr, $zr - - mulx $a1, @acc[0], $lo - adcx $hi, @acc[0] - 
mov @acc[1], 8*0($r_ptr) - - mulx @acc[2], @acc[1], $hi - adcx $lo, @acc[1] - - mulx @acc[3], @acc[2], $lo - adcx $hi, @acc[2] - - mulx @acc[4], @acc[3], $hi - adcx $lo, @acc[3] - - mulx @acc[5], @acc[4], @acc[5] - mov 8*1($b_ptr), %rdx - adcx $hi, @acc[4] - adcx $zr, @acc[5] -___ -for(my $i=1; $i<6; $i++) { -my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : "%rax"; -$code.=<<___; - mulx $a0, $lo, $hi - adcx @acc[0], $lo - adox $hi, @acc[1] - mov $lo, 8*$i($r_ptr) - - mulx $a1, @acc[0], $hi - adcx @acc[1], $acc[0] - adox $hi, @acc[2] - - mulx 128+8*2($a_ptr), @acc[1], $lo - adcx @acc[2], @acc[1] - adox $lo, @acc[3] - - mulx 128+8*3($a_ptr), @acc[2], $hi - adcx @acc[3], @acc[2] - adox $hi, @acc[4] - - mulx 128+8*4($a_ptr), @acc[3], $lo - adcx @acc[4], @acc[3] - adox @acc[5], $lo - - mulx 128+8*5($a_ptr), @acc[4], @acc[5] - mov $b_next, %rdx - adcx $lo, @acc[4] - adox $zr, @acc[5] - adcx $zr, @acc[5] -___ -} -$code.=<<___; - mov @acc[0], 8*6($r_ptr) - mov @acc[1], 8*7($r_ptr) - mov @acc[2], 8*8($r_ptr) - mov @acc[3], 8*9($r_ptr) - mov @acc[4], 8*10($r_ptr) - mov @acc[5], 8*11($r_ptr) - - ret -.size __mulx_384,.-__mulx_384 -___ -} -{ ########################################################## 384-bit sqrx -$code.=<<___; -.globl sqrx_384 -.hidden sqrx_384 -.type sqrx_384,\@function,2,"unwind" -.align 32 -sqrx_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $r_ptr -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - call __sqrx_384 - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size sqrx_384,.-sqrx_384 -___ -if (0) { -# up to 5% slower than below variant -my @acc=map("%r$_",("no",8..15,"cx","bx")); - push(@acc, $a_ptr); -my ($lo, $hi, $carry)=("%rax", "%rbp", "%rno"); - -$code.=<<___; -.type __sqrx_384,\@abi-omnipotent -.align 32 -__sqrx_384: - mov 8*0($a_ptr), %rdx - mov 8*1($a_ptr), @acc[7] - mov 8*2($a_ptr), @acc[8] - mov 8*3($a_ptr), @acc[9] - mov 8*4($a_ptr), @acc[10] - - ######################################### - mulx @acc[7], @acc[1], $lo # a[1]*a[0] - mov 8*5($a_ptr), @acc[11] - mulx @acc[8], @acc[2], $hi # a[2]*a[0] - add $lo, @acc[2] - mulx @acc[9], @acc[3], $lo # a[3]*a[0] - adc $hi, @acc[3] - mulx @acc[10], @acc[4], $hi # a[4]*a[0] - adc $lo, @acc[4] - mulx @acc[11], @acc[5], @acc[6] # a[5]*a[0] - adc $hi, @acc[5] - adc \$0, @acc[6] - - mulx %rdx, $lo, $hi # a[0]*a[0] - mov @acc[7], %rdx - xor @acc[7], @acc[7] - add @acc[1], @acc[1] # double acc[1] - adc \$0, @acc[7] - add $hi, @acc[1] - adc \$0, @acc[7] - mov $lo, 8*0($r_ptr) - mov @acc[1], 8*1($r_ptr) -___ -($carry, @acc[7]) = (@acc[7], @acc[1]); -$code.=<<___; - ######################################### - xor @acc[7], @acc[7] - mulx @acc[8], $lo, $hi # a[2]*a[1] - adcx $lo, @acc[3] - adox $hi, @acc[4] - - mulx @acc[9], $lo, $hi # a[3]*a[1] - adcx $lo, @acc[4] - adox $hi, @acc[5] - - mulx @acc[10], $lo, $hi # a[4]*a[1] - adcx $lo, @acc[5] - adox $hi, @acc[6] - - mulx @acc[11], $lo, $hi # a[5]*a[1] - adcx $lo, @acc[6] - adox @acc[7], $hi - adcx $hi, @acc[7] - - mulx %rdx, $lo, $hi # a[1]*a[1] - mov @acc[8], %rdx - xor @acc[8], @acc[8] - adox @acc[2], @acc[2] # double acc[2:3] - adcx $carry, $lo # can't carry - adox @acc[3], 
@acc[3] - adcx $lo, @acc[2] - adox @acc[8], @acc[8] - adcx $hi, @acc[3] - adc \$0, @acc[8] - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) -___ -($carry,@acc[8])=(@acc[8],$carry); -$code.=<<___; - ######################################### - xor @acc[8], @acc[8] - mulx @acc[9], $lo, $hi # a[3]*a[2] - adcx $lo, @acc[5] - adox $hi, @acc[6] - - mulx @acc[10], $lo, $hi # a[4]*a[2] - adcx $lo, @acc[6] - adox $hi, @acc[7] - - mulx @acc[11], $lo, $hi # a[5]*a[2] - adcx $lo, @acc[7] - adox @acc[8], $hi - adcx $hi, @acc[8] - - mulx %rdx, $lo, $hi # a[2]*a[2] - mov @acc[9], %rdx - xor @acc[9], @acc[9] - adox @acc[4], @acc[4] # double acc[4:5] - adcx $carry, $lo # can't carry - adox @acc[5], @acc[5] - adcx $lo, @acc[4] - adox @acc[9], @acc[9] - adcx $hi, @acc[5] - adc \$0, $acc[9] - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) -___ -($carry,@acc[9])=(@acc[9],$carry); -$code.=<<___; - ######################################### - xor @acc[9], @acc[9] - mulx @acc[10], $lo, $hi # a[4]*a[3] - adcx $lo, @acc[7] - adox $hi, @acc[8] - - mulx @acc[11], $lo, $hi # a[5]*a[3] - adcx $lo, @acc[8] - adox @acc[9], $hi - adcx $hi, @acc[9] - - mulx %rdx, $lo, $hi - mov @acc[10], %rdx - xor @acc[10], @acc[10] - adox @acc[6], @acc[6] # double acc[6:7] - adcx $carry, $lo # can't carry - adox @acc[7], @acc[7] - adcx $lo, @acc[6] - adox @acc[10], @acc[10] - adcx $hi, @acc[7] - adc \$0, $acc[10] - mov @acc[6], 8*6($r_ptr) - mov @acc[7], 8*7($r_ptr) -___ -($carry,@acc[10])=(@acc[10],$carry); -$code.=<<___; - ######################################### - mulx @acc[11], $lo, @acc[10] # a[5]*a[4] - add $lo, @acc[9] - adc \$0, @acc[10] - - mulx %rdx, $lo, $hi # a[4]*a[4] - mov @acc[11], %rdx - xor @acc[11], @acc[11] - adox @acc[8], @acc[8] # double acc[8:10] - adcx $carry, $lo # can't carry - adox @acc[9], @acc[9] - adcx $lo, @acc[8] - adox @acc[10], @acc[10] - adcx $hi, @acc[9] - adox @acc[11], @acc[11] - mov @acc[8], 8*8($r_ptr) - mov @acc[9], 8*9($r_ptr) - - ######################################### - mulx %rdx, $lo, $hi # a[5]*a[5] - adcx $lo, @acc[10] - adcx $hi, @acc[11] - - mov @acc[10], 8*10($r_ptr) - mov @acc[11], 8*11($r_ptr) - - ret -.size __sqrx_384,.-__sqrx_384 -___ -} else { -my @acc=map("%r$_",("no",8..15,"cx","bx","bp")); -my ($lo, $hi)=($r_ptr, "%rax"); - -$code.=<<___; -.type __sqrx_384,\@abi-omnipotent -.align 32 -__sqrx_384: - mov 8*0($a_ptr), %rdx - mov 8*1($a_ptr), @acc[7] - mov 8*2($a_ptr), @acc[8] - mov 8*3($a_ptr), @acc[9] - mov 8*4($a_ptr), @acc[10] - - ######################################### - mulx @acc[7], @acc[1], $lo # a[1]*a[0] - mov 8*5($a_ptr), @acc[11] - mulx @acc[8], @acc[2], $hi # a[2]*a[0] - add $lo, @acc[2] - mulx @acc[9], @acc[3], $lo # a[3]*a[0] - adc $hi, @acc[3] - mulx @acc[10], @acc[4], $hi # a[4]*a[0] - adc $lo, @acc[4] - mulx @acc[11], @acc[5], @acc[6] # a[5]*a[0] - mov @acc[7], %rdx - adc $hi, @acc[5] - adc \$0, @acc[6] - - ######################################### - xor @acc[7], @acc[7] - mulx @acc[8], $lo, $hi # a[2]*a[1] - adcx $lo, @acc[3] - adox $hi, @acc[4] - - mulx @acc[9], $lo, $hi # a[3]*a[1] - adcx $lo, @acc[4] - adox $hi, @acc[5] - - mulx @acc[10], $lo, $hi # a[4]*a[1] - adcx $lo, @acc[5] - adox $hi, @acc[6] - - mulx @acc[11], $lo, $hi # a[5]*a[1] - mov @acc[8], %rdx - adcx $lo, @acc[6] - adox @acc[7], $hi - adcx $hi, @acc[7] - - ######################################### - xor @acc[8], @acc[8] - mulx @acc[9], $lo, $hi # a[3]*a[2] - adcx $lo, @acc[5] - adox $hi, @acc[6] - - mulx @acc[10], $lo, $hi # a[4]*a[2] - adcx $lo, @acc[6] - adox $hi, @acc[7] - - mulx 
@acc[11], $lo, $hi # a[5]*a[2] - mov @acc[9], %rdx - adcx $lo, @acc[7] - adox @acc[8], $hi - adcx $hi, @acc[8] - - ######################################### - xor @acc[9], @acc[9] - mulx @acc[10], $lo, $hi # a[4]*a[3] - adcx $lo, @acc[7] - adox $hi, @acc[8] - - mulx @acc[11], $lo, $hi # a[5]*a[3] - mov @acc[10], %rdx - adcx $lo, @acc[8] - adox @acc[9], $hi - adcx $hi, @acc[9] - - ######################################### - mulx @acc[11], $lo, @acc[10] # a[5]*a[4] - mov 8*0($a_ptr), %rdx - add $lo, @acc[9] - mov 8(%rsp), $r_ptr # restore $r_ptr - adc \$0, @acc[10] - - ######################################### double acc[1:10] - xor @acc[11], @acc[11] - adcx @acc[1], @acc[1] - adcx @acc[2], @acc[2] - adcx @acc[3], @acc[3] - adcx @acc[4], @acc[4] - adcx @acc[5], @acc[5] - - ######################################### accumulate a[i]*a[i] - mulx %rdx, %rdx, $hi # a[0]*a[0] - mov %rdx, 8*0($r_ptr) - mov 8*1($a_ptr), %rdx - adox $hi, @acc[1] - mov @acc[1], 8*1($r_ptr) - - mulx %rdx, @acc[1], $hi # a[1]*a[1] - mov 8*2($a_ptr), %rdx - adox @acc[1], @acc[2] - adox $hi, @acc[3] - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - - mulx %rdx, @acc[1], @acc[2] # a[2]*a[2] - mov 8*3($a_ptr), %rdx - adox @acc[1], @acc[4] - adox @acc[2], @acc[5] - adcx @acc[6], @acc[6] - adcx @acc[7], @acc[7] - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - mulx %rdx, @acc[1], @acc[2] # a[3]*a[3] - mov 8*4($a_ptr), %rdx - adox @acc[1], @acc[6] - adox @acc[2], @acc[7] - adcx @acc[8], @acc[8] - adcx @acc[9], @acc[9] - mov @acc[6], 8*6($r_ptr) - mov @acc[7], 8*7($r_ptr) - - mulx %rdx, @acc[1], @acc[2] # a[4]*a[4] - mov 8*5($a_ptr), %rdx - adox @acc[1], @acc[8] - adox @acc[2], @acc[9] - adcx @acc[10], @acc[10] - adcx @acc[11], @acc[11] - mov @acc[8], 8*8($r_ptr) - mov @acc[9], 8*9($r_ptr) - - mulx %rdx, @acc[1], @acc[2] # a[5]*a[5] - adox @acc[1], @acc[10] - adox @acc[2], @acc[11] - - mov @acc[10], 8*10($r_ptr) - mov @acc[11], 8*11($r_ptr) - - ret -.size __sqrx_384,.-__sqrx_384 -___ -} - -{ ########################################################## 384-bit redcx_mont -my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" -my ($lo, $hi) = ("%rax", "%rbp"); - -$code.=<<___; -######################################################################## -# void redcx_mont_384(uint64_t ret[6], const uint64_t a[12], -# uint64_t m[6], uint64_t n0); -.globl redcx_mont_384 -.hidden redcx_mont_384 -.type redcx_mont_384,\@function,4,"unwind" -.align 32 -redcx_mont_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov $b_org, $n_ptr - call __mulx_by_1_mont_384 - call __redc_tail_mont_384 - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size redcx_mont_384,.-redcx_mont_384 - -######################################################################## -# void fromx_mont_384(uint64_t ret[6], const uint64_t a[6], -# uint64_t m[6], uint64_t n0); -.globl fromx_mont_384 -.hidden fromx_mont_384 -.type fromx_mont_384,\@function,4,"unwind" -.align 32 -fromx_mont_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - 
push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov $b_org, $n_ptr - call __mulx_by_1_mont_384 - - ################################# - # Branch-less conditional acc[0:6] - modulus - - mov @acc[6], %rax - mov @acc[7], %rcx - mov @acc[0], %rdx - mov @acc[1], %rbp - - sub 8*0($n_ptr), @acc[6] - sbb 8*1($n_ptr), @acc[7] - mov @acc[2], @acc[5] - sbb 8*2($n_ptr), @acc[0] - sbb 8*3($n_ptr), @acc[1] - sbb 8*4($n_ptr), @acc[2] - mov @acc[3], $a_ptr - sbb 8*5($n_ptr), @acc[3] - - cmovc %rax, @acc[6] - cmovc %rcx, @acc[7] - cmovc %rdx, @acc[0] - mov @acc[6], 8*0($r_ptr) - cmovc %rbp, @acc[1] - mov @acc[7], 8*1($r_ptr) - cmovc @acc[5], @acc[2] - mov @acc[0], 8*2($r_ptr) - cmovc $a_ptr, @acc[3] - mov @acc[1], 8*3($r_ptr) - mov @acc[2], 8*4($r_ptr) - mov @acc[3], 8*5($r_ptr) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size fromx_mont_384,.-fromx_mont_384 -___ -{ my @acc=@acc; # will be rotated locally - -$code.=<<___; -.type __mulx_by_1_mont_384,\@abi-omnipotent -.align 32 -__mulx_by_1_mont_384: - mov 8*0($a_ptr), @acc[0] - mov $n0, %rdx - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] -___ -for (my $i=0; $i<6; $i++) { -$code.=<<___; - imulq @acc[0], %rdx - - ################################# reduction $i - xor @acc[6], @acc[6] # @acc[6]=0, cf=0, of=0 - mulx 8*0($n_ptr), $lo, $hi - adcx $lo, @acc[0] # guaranteed to be zero - adox $hi, @acc[1] - - mulx 8*1($n_ptr), $lo, $hi - adcx $lo, @acc[1] - adox $hi, @acc[2] - - mulx 8*2($n_ptr), $lo, $hi - adcx $lo, @acc[2] - adox $hi, @acc[3] - - mulx 8*3($n_ptr), $lo, $hi - adcx $lo, @acc[3] - adox $hi, @acc[4] - - mulx 8*4($n_ptr), $lo, $hi - adcx $lo, @acc[4] - adox $hi, @acc[5] - - mulx 8*5($n_ptr), $lo, $hi - mov $n0, %rdx - adcx $lo, @acc[5] - adox @acc[6], $hi - adcx $hi, @acc[6] -___ - push(@acc,shift(@acc)); -} -$code.=<<___; - ret -.size __mulx_by_1_mont_384,.-__mulx_by_1_mont_384 - -.type __redc_tail_mont_384,\@abi-omnipotent -.align 32 -__redc_tail_mont_384: - add 8*6($a_ptr), @acc[0] # accumulate upper half - mov @acc[0], %rax - adc 8*7($a_ptr), @acc[1] - adc 8*8($a_ptr), @acc[2] - adc 8*9($a_ptr), @acc[3] - mov @acc[1], %rcx - adc 8*10($a_ptr), @acc[4] - adc 8*11($a_ptr), @acc[5] - sbb @acc[6], @acc[6] - - ################################# - # Branch-less conditional acc[0:6] - modulus - - mov @acc[2], %rdx - mov @acc[3], %rbp - - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - mov @acc[4], @acc[7] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - mov @acc[5], $a_ptr - sbb 8*5($n_ptr), @acc[5] - sbb \$0, @acc[6] - - cmovc %rax, @acc[0] - cmovc %rcx, @acc[1] - cmovc %rdx, @acc[2] - mov @acc[0], 8*0($r_ptr) - cmovc %rbp, @acc[3] - mov @acc[1], 8*1($r_ptr) - cmovc @acc[7], @acc[4] - mov @acc[2], 8*2($r_ptr) - cmovc $a_ptr, @acc[5] - mov @acc[3], 8*3($r_ptr) - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - ret -.size __redc_tail_mont_384,.-__redc_tail_mont_384 - -.globl sgn0x_pty_mont_384 -.hidden sgn0x_pty_mont_384 -.type sgn0x_pty_mont_384,\@function,3,"unwind" -.align 32 -sgn0x_pty_mont_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx 
-.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov $a_ptr, $n_ptr - lea 0($r_ptr), $a_ptr - mov $b_org, $n0 - call __mulx_by_1_mont_384 - - xor %rax, %rax - mov @acc[0], @acc[7] - add @acc[0], @acc[0] - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - adc @acc[3], @acc[3] - adc @acc[4], @acc[4] - adc @acc[5], @acc[5] - adc \$0, %rax - - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, %rax - - not %rax # 2*x > p, which means "negative" - and \$1, @acc[7] - and \$2, %rax - or @acc[7], %rax # pack sign and parity - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size sgn0x_pty_mont_384,.-sgn0x_pty_mont_384 - -.globl sgn0x_pty_mont_384x -.hidden sgn0x_pty_mont_384x -.type sgn0x_pty_mont_384x,\@function,3,"unwind" -.align 32 -sgn0x_pty_mont_384x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov $a_ptr, $n_ptr - lea 48($r_ptr), $a_ptr # sgn0(a->im) - mov $b_org, $n0 - call __mulx_by_1_mont_384 - - mov @acc[0], @acc[6] - or @acc[1], @acc[0] - or @acc[2], @acc[0] - or @acc[3], @acc[0] - or @acc[4], @acc[0] - or @acc[5], @acc[0] - - lea 0($r_ptr), $a_ptr # sgn0(a->re) - xor $r_ptr, $r_ptr - mov @acc[6], @acc[7] - add @acc[6], @acc[6] - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - adc @acc[3], @acc[3] - adc @acc[4], @acc[4] - adc @acc[5], @acc[5] - adc \$0, $r_ptr - - sub 8*0($n_ptr), @acc[6] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, $r_ptr - - mov @acc[0], 0(%rsp) # a->im is zero or not - not $r_ptr # 2*x > p, which means "negative" - and \$1, @acc[7] - and \$2, $r_ptr - or @acc[7], $r_ptr # pack sign and parity - - call __mulx_by_1_mont_384 - - mov @acc[0], @acc[6] - or @acc[1], @acc[0] - or @acc[2], @acc[0] - or @acc[3], @acc[0] - or @acc[4], @acc[0] - or @acc[5], @acc[0] - - xor %rax, %rax - mov @acc[6], @acc[7] - add @acc[6], @acc[6] - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - adc @acc[3], @acc[3] - adc @acc[4], @acc[4] - adc @acc[5], @acc[5] - adc \$0, %rax - - sub 8*0($n_ptr), @acc[6] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, %rax - - mov 0(%rsp), @acc[6] - - not %rax # 2*x > p, which means "negative" - - test @acc[0], @acc[0] - cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) - - test @acc[6], @acc[6] - cmovnz $r_ptr, %rax # a->im!=0? 
sgn0(a->im) : sgn0(a->re) - - and \$1, @acc[7] - and \$2, %rax - or @acc[7], %rax # pack sign and parity - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size sgn0x_pty_mont_384x,.-sgn0x_pty_mont_384x -___ -} } - -{ ########################################################## mulx/sqrx_mont -my @acc = (@acc, "%rax"); -my ($lo,$hi)=("%rdi","%rbp"); - -$code.=<<___; -.globl mulx_mont_384 -.hidden mulx_mont_384 -.type mulx_mont_384,\@function,5,"unwind" -.align 32 -mulx_mont_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - lea -8*3(%rsp), %rsp -.cfi_adjust_cfa_offset 8*3 -.cfi_end_prologue - - mov $b_org, $b_ptr # evacuate from %rdx - mov 8*0($b_org), %rdx - mov 8*0($a_ptr), @acc[6] - mov 8*1($a_ptr), @acc[7] - mov 8*2($a_ptr), @acc[8] - mov 8*3($a_ptr), @acc[4] - mov $r_ptr, 8*2(%rsp) - mov 8*4($a_ptr), $lo - mov 8*5($a_ptr), $hi - lea -128($a_ptr), $a_ptr # control u-op density - lea -128($n_ptr), $n_ptr # control u-op density - mov $n0, (%rsp) - - mulx @acc[6],@acc[0],@acc[1] # a[0]*b[0] - call __mulx_mont_384 - - mov 8*3(%rsp),%r15 -.cfi_restore %r15 - mov 8*4(%rsp),%r14 -.cfi_restore %r14 - mov 8*5(%rsp),%r13 -.cfi_restore %r13 - mov 8*6(%rsp),%r12 -.cfi_restore %r12 - mov 8*7(%rsp),%rbx -.cfi_restore %rbx - mov 8*8(%rsp),%rbp -.cfi_restore %rbp - lea 8*9(%rsp),%rsp -.cfi_adjust_cfa_offset -8*9 -.cfi_epilogue - ret -.cfi_endproc -.size mulx_mont_384,.-mulx_mont_384 -___ -{ my @acc=@acc; # will be rotated locally - -$code.=<<___; -.type __mulx_mont_384,\@abi-omnipotent -.align 32 -__mulx_mont_384: -.cfi_startproc - mulx @acc[7], @acc[6], @acc[2] - mulx @acc[8], @acc[7], @acc[3] - add @acc[6], @acc[1] - mulx @acc[4], @acc[8], @acc[4] - adc @acc[7], @acc[2] - mulx $lo, $lo, @acc[5] - adc @acc[8], @acc[3] - mulx $hi, $hi, @acc[6] - mov 8($b_ptr), %rdx - adc $lo, @acc[4] - adc $hi, @acc[5] - adc \$0, @acc[6] - xor @acc[7], @acc[7] - -___ -for (my $i=1; $i<6; $i++) { -my $tt = $i==1 ? @acc[7] : $hi; -my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : @acc[1]; -$code.=<<___; - mov @acc[0], 16(%rsp) - imulq 8(%rsp), @acc[0] - - ################################# Multiply by b[$i] - xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 - mulx 8*0+128($a_ptr), $lo, $hi - adox $lo, @acc[1] - adcx $hi, @acc[2] - - mulx 8*1+128($a_ptr), $lo, $hi - adox $lo, @acc[2] - adcx $hi, @acc[3] - - mulx 8*2+128($a_ptr), $lo, $hi - adox $lo, @acc[3] - adcx $hi, @acc[4] - - mulx 8*3+128($a_ptr), $lo, $hi - adox $lo, @acc[4] - adcx $hi, @acc[5] - - mulx 8*4+128($a_ptr), $lo, $hi - adox $lo, @acc[5] - adcx $hi, @acc[6] - - mulx 8*5+128($a_ptr), $lo, $hi - mov @acc[0], %rdx - adox $lo, @acc[6] - adcx $hi, @acc[7] # cf=0 - adox @acc[8], @acc[7] - adox @acc[8], @acc[8] - - ################################# reduction - xor @acc[0], @acc[0] # acc[0]=0, cf=0, of=0 - mulx 8*0+128($n_ptr), $lo, $hi - adcx 16(%rsp), $lo # guaranteed to be zero - adox $hi, @acc[1] - - mulx 8*1+128($n_ptr), $lo, $hi - adcx $lo, @acc[1] - adox $hi, @acc[2] - - mulx 8*2+128($n_ptr), $lo, $hi - adcx $lo, @acc[2] - adox $hi, @acc[3] - - mulx 8*3+128($n_ptr), $lo, $hi - adcx $lo, @acc[3] - adox $hi, @acc[4] - - mulx 8*4+128($n_ptr), $lo, $hi - adcx $lo, @acc[4] - adox $hi, @acc[5] - - mulx 8*5+128($n_ptr), $lo, $hi - mov $b_next, %rdx - adcx $lo, @acc[5] - adox $hi, @acc[6] - adcx @acc[0], @acc[6] - adox @acc[0], @acc[7] - adcx @acc[0], @acc[7] - adox @acc[0], @acc[8] - adcx @acc[0], @acc[8] -___ - push(@acc,shift(@acc)); -} -$code.=<<___; - imulq 8(%rsp), %rdx - mov 8*3(%rsp), $b_ptr # restore $r_ptr - - ################################# last reduction - xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 - mulx 8*0+128($n_ptr), $lo, $hi - adcx $lo, @acc[0] # guaranteed to be zero - adox $hi, @acc[1] - - mulx 8*1+128($n_ptr), $lo, $hi - adcx $lo, @acc[1] - adox $hi, @acc[2] - - mulx 8*2+128($n_ptr), $lo, $hi - adcx $lo, @acc[2] - adox $hi, @acc[3] - - mulx 8*3+128($n_ptr), $lo, $hi - adcx $lo, @acc[3] - adox $hi, @acc[4] - mov @acc[2], @acc[0] - - mulx 8*4+128($n_ptr), $lo, $hi - adcx $lo, @acc[4] - adox $hi, @acc[5] - mov @acc[3], $a_ptr - - mulx 8*5+128($n_ptr), $lo, $hi - adcx $lo, @acc[5] - adox $hi, @acc[6] - mov @acc[1], %rdx - adcx @acc[8], @acc[6] - adox @acc[8], @acc[7] - lea 128($n_ptr), $n_ptr - mov @acc[4], @acc[8] - adc \$0, @acc[7] - - ################################# - # Branch-less conditional acc[1:7] - modulus - - sub 8*0($n_ptr), @acc[1] - sbb 8*1($n_ptr), @acc[2] - mov @acc[5], $lo - sbb 8*2($n_ptr), @acc[3] - sbb 8*3($n_ptr), @acc[4] - sbb 8*4($n_ptr), @acc[5] - mov @acc[6], $hi - sbb 8*5($n_ptr), @acc[6] - sbb \$0, @acc[7] - - cmovnc @acc[1], %rdx - cmovc @acc[0], @acc[2] - cmovc $a_ptr, @acc[3] - cmovnc @acc[4], @acc[8] - mov %rdx, 8*0($b_ptr) - cmovnc @acc[5], $lo - mov @acc[2], 8*1($b_ptr) - cmovnc @acc[6], $hi - mov @acc[3], 8*2($b_ptr) - mov @acc[8], 8*3($b_ptr) - mov $lo, 8*4($b_ptr) - mov $hi, 8*5($b_ptr) - - ret -.cfi_endproc -.size __mulx_mont_384,.-__mulx_mont_384 -___ -} -$code.=<<___; -.globl sqrx_mont_384 -.hidden sqrx_mont_384 -.type sqrx_mont_384,\@function,4,"unwind" -.align 32 -sqrx_mont_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - lea -8*3(%rsp), %rsp -.cfi_adjust_cfa_offset 8*3 -.cfi_end_prologue - - mov $n_ptr, $n0 # n0 - lea -128($b_org), $n_ptr # control u-op density - mov 8*0($a_ptr), %rdx - mov 8*1($a_ptr), @acc[7] - mov 8*2($a_ptr), @acc[8] - mov 8*3($a_ptr), @acc[4] - mov $r_ptr, 8*2(%rsp) - 
mov 8*4($a_ptr), $lo - mov 8*5($a_ptr), $hi - - lea ($a_ptr), $b_ptr - mov $n0, (%rsp) # n0 - lea -128($a_ptr), $a_ptr # control u-op density - - mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] - call __mulx_mont_384 # as fast as dedicated squaring - - mov 8*3(%rsp),%r15 -.cfi_restore %r15 - mov 8*4(%rsp),%r14 -.cfi_restore %r14 - mov 8*5(%rsp),%r13 -.cfi_restore %r13 - mov 8*6(%rsp),%r12 -.cfi_restore %r12 - mov 8*7(%rsp),%rbx -.cfi_restore %rbx - mov 8*8(%rsp),%rbp -.cfi_restore %rbp - lea 8*9(%rsp),%rsp -.cfi_adjust_cfa_offset -8*9 -.cfi_epilogue - ret -.cfi_endproc -.size sqrx_mont_384,.-sqrx_mont_384 - -.globl sqrx_n_mul_mont_384 -.hidden sqrx_n_mul_mont_384 -.type sqrx_n_mul_mont_384,\@function,6,"unwind" -.align 32 -sqrx_n_mul_mont_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - lea -8*5(%rsp), %rsp -.cfi_adjust_cfa_offset 8*5 -.cfi_end_prologue - - mov $b_org, @acc[2] # loop counter - mov 8*0($a_ptr), %rdx - mov 8*1($a_ptr), @acc[7] - mov 8*2($a_ptr), @acc[8] - mov $a_ptr, $b_ptr - mov 8*3($a_ptr), @acc[4] - mov $r_ptr, 8*2(%rsp) # to __mulx_mont_384 - mov 8*4($a_ptr), $lo - mov 8*5($a_ptr), $hi - - mov $n0, (%rsp) - mov %r9, 8*3(%rsp) # 6th, multiplicand argument - movq 8*0(%r9), %xmm2 # prefetch b[0] - -.Loop_sqrx_384: - movd @acc[2]d, %xmm1 - lea -128($b_ptr), $a_ptr # control u-op density - lea -128($n_ptr), $n_ptr # control u-op density - - mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] - call __mulx_mont_384 - - movd %xmm1, @acc[2]d - dec @acc[2]d - jnz .Loop_sqrx_384 - - mov %rdx, @acc[6] - movq %xmm2, %rdx # b[0] - lea -128($b_ptr), $a_ptr # control u-op density - mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument - lea -128($n_ptr), $n_ptr # control u-op density - - mulx @acc[6],@acc[0],@acc[1] # a[0]*b[0] - call __mulx_mont_384 - - mov 8*5(%rsp),%r15 -.cfi_restore %r15 - mov 8*6(%rsp),%r14 -.cfi_restore %r14 - mov 8*7(%rsp),%r13 -.cfi_restore %r13 - mov 8*8(%rsp),%r12 -.cfi_restore %r12 - mov 8*9(%rsp),%rbx -.cfi_restore %rbx - mov 8*10(%rsp),%rbp -.cfi_restore %rbp - lea 8*11(%rsp),%rsp -.cfi_adjust_cfa_offset -8*11 -.cfi_epilogue - ret -.cfi_endproc -.size sqrx_n_mul_mont_384,.-sqrx_n_mul_mont_384 - -.globl sqrx_n_mul_mont_383 -.hidden sqrx_n_mul_mont_383 -.type sqrx_n_mul_mont_383,\@function,6,"unwind" -.align 32 -sqrx_n_mul_mont_383: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - lea -8*5(%rsp), %rsp -.cfi_adjust_cfa_offset 8*5 -.cfi_end_prologue - - mov $b_org, @acc[2] # loop counter - mov 8*0($a_ptr), %rdx - mov 8*1($a_ptr), @acc[7] - mov 8*2($a_ptr), @acc[8] - mov $a_ptr, $b_ptr - mov 8*3($a_ptr), @acc[4] - mov $r_ptr, 8*2(%rsp) # to __mulx_mont_383_nonred - mov 8*4($a_ptr), $lo - mov 8*5($a_ptr), $hi - - mov $n0, (%rsp) - mov %r9, 8*3(%rsp) # 6th, multiplicand argument - movq 8*0(%r9), %xmm2 # prefetch b[0] - lea -128($n_ptr), $n_ptr # control u-op density - -.Loop_sqrx_383: - movd @acc[2]d, %xmm1 - lea -128($b_ptr), $a_ptr # control u-op density - - mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] - call __mulx_mont_383_nonred # omitting full reduction gives ~15% - # in addition-chains - movd %xmm1, @acc[2]d - dec @acc[2]d - jnz .Loop_sqrx_383 - - mov %rdx, @acc[6] - movq %xmm2, %rdx # b[0] - lea -128($b_ptr), $a_ptr # control u-op density - mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument - - mulx @acc[6], 
@acc[0], @acc[1] # a[0]*b[0] - call __mulx_mont_384 - - mov 8*5(%rsp),%r15 -.cfi_restore %r15 - mov 8*6(%rsp),%r14 -.cfi_restore %r14 - mov 8*7(%rsp),%r13 -.cfi_restore %r13 - mov 8*8(%rsp),%r12 -.cfi_restore %r12 - mov 8*9(%rsp),%rbx -.cfi_restore %rbx - mov 8*10(%rsp),%rbp -.cfi_restore %rbp - lea 8*11(%rsp),%rsp -.cfi_adjust_cfa_offset -8*11 -.cfi_epilogue - ret -.cfi_endproc -.size sqrx_n_mul_mont_383,.-sqrx_n_mul_mont_383 -___ -{ my @acc=@acc; # will be rotated locally - -$code.=<<___; -.type __mulx_mont_383_nonred,\@abi-omnipotent -.align 32 -__mulx_mont_383_nonred: -.cfi_startproc - mulx @acc[7], @acc[6], @acc[2] - mulx @acc[8], @acc[7], @acc[3] - add @acc[6], @acc[1] - mulx @acc[4], @acc[8], @acc[4] - adc @acc[7], @acc[2] - mulx $lo, $lo, @acc[5] - adc @acc[8], @acc[3] - mulx $hi, $hi, @acc[6] - mov 8($b_ptr), %rdx - adc $lo, @acc[4] - adc $hi, @acc[5] - adc \$0, @acc[6] -___ -for (my $i=1; $i<6; $i++) { -my $tt = $i==1 ? @acc[7] : $hi; -my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : @acc[1]; -$code.=<<___; - mov @acc[0], @acc[8] - imulq 8(%rsp), @acc[0] - - ################################# Multiply by b[$i] - xor @acc[7], @acc[7] # @acc[8]=0, cf=0, of=0 - mulx 8*0+128($a_ptr), $lo, $hi - adox $lo, @acc[1] - adcx $hi, @acc[2] - - mulx 8*1+128($a_ptr), $lo, $hi - adox $lo, @acc[2] - adcx $hi, @acc[3] - - mulx 8*2+128($a_ptr), $lo, $hi - adox $lo, @acc[3] - adcx $hi, @acc[4] - - mulx 8*3+128($a_ptr), $lo, $hi - adox $lo, @acc[4] - adcx $hi, @acc[5] - - mulx 8*4+128($a_ptr), $lo, $hi - adox $lo, @acc[5] - adcx $hi, @acc[6] - - mulx 8*5+128($a_ptr), $lo, $hi - mov @acc[0], %rdx - adox $lo, @acc[6] - adcx @acc[7], $hi - adox $hi, @acc[7] - - ################################# reduction - xor @acc[0], @acc[0] # acc[0]=0, cf=0, of=0 - mulx 8*0+128($n_ptr), $lo, $hi - adcx $lo, @acc[8] # guaranteed to be zero - adox $hi, @acc[1] - - mulx 8*1+128($n_ptr), $lo, $hi - adcx $lo, @acc[1] - adox $hi, @acc[2] - - mulx 8*2+128($n_ptr), $lo, $hi - adcx $lo, @acc[2] - adox $hi, @acc[3] - - mulx 8*3+128($n_ptr), $lo, $hi - adcx $lo, @acc[3] - adox $hi, @acc[4] - - mulx 8*4+128($n_ptr), $lo, $hi - adcx $lo, @acc[4] - adox $hi, @acc[5] - - mulx 8*5+128($n_ptr), $lo, $hi - mov $b_next, %rdx - adcx $lo, @acc[5] - adox $hi, @acc[6] - adcx @acc[8], @acc[6] - adox @acc[8], @acc[7] - adcx @acc[8], @acc[7] -___ - push(@acc,shift(@acc)); -} -$code.=<<___; - imulq 8(%rsp), %rdx - mov 8*3(%rsp), $b_ptr # restore $r_ptr - - ################################# last reduction - xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 - mulx 8*0+128($n_ptr), $lo, $hi - adcx $lo, @acc[0] # guaranteed to be zero - adox $hi, @acc[1] - - mulx 8*1+128($n_ptr), $lo, $hi - adcx $lo, @acc[1] - adox $hi, @acc[2] - - mulx 8*2+128($n_ptr), $lo, $hi - adcx $lo, @acc[2] - adox $hi, @acc[3] - - mulx 8*3+128($n_ptr), $lo, $hi - adcx $lo, @acc[3] - adox $hi, @acc[4] - - mulx 8*4+128($n_ptr), $lo, $hi - adcx $lo, @acc[4] - adox $hi, @acc[5] - - mulx 8*5+128($n_ptr), $lo, $hi - mov @acc[1], %rdx - adcx $lo, @acc[5] - adox $hi, @acc[6] - adc \$0, @acc[6] - mov @acc[4], @acc[8] - - mov @acc[1], 8*0($b_ptr) - mov @acc[2], 8*1($b_ptr) - mov @acc[3], 8*2($b_ptr) - mov @acc[5], $lo - mov @acc[4], 8*3($b_ptr) - mov @acc[5], 8*4($b_ptr) - mov @acc[6], 8*5($b_ptr) - mov @acc[6], $hi - - ret -.cfi_endproc -.size __mulx_mont_383_nonred,.-__mulx_mont_383_nonred -___ -} } } -{ my $frame = 4*8 + # place for argument off-load + - 2*384/8 + # place for 2 384-bit temporary vectors - 8; # align -my @acc = (@acc,"%rax","%rdx","%rbx","%rbp"); - -# omitting 3 reductions 
gives ~10% better performance in add-chains -$code.=<<___; -.globl sqrx_mont_382x -.hidden sqrx_mont_382x -.type sqrx_mont_382x,\@function,4,"unwind" -.align 32 -sqrx_mont_382x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$$frame, %rsp -.cfi_adjust_cfa_offset $frame -.cfi_end_prologue - - mov $n_ptr, 8*0(%rsp) # n0 - mov $b_org, $n_ptr # n_ptr - mov $r_ptr, 8*2(%rsp) - mov $a_ptr, 8*3(%rsp) - - ################################# - mov 8*0($a_ptr), @acc[0] # a->re - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - - mov @acc[0], @acc[6] - add 8*6($a_ptr), @acc[0] # a->re + a->im - mov @acc[1], @acc[7] - adc 8*7($a_ptr), @acc[1] - mov @acc[2], @acc[8] - adc 8*8($a_ptr), @acc[2] - mov @acc[3], @acc[9] - adc 8*9($a_ptr), @acc[3] - mov @acc[4], @acc[10] - adc 8*10($a_ptr), @acc[4] - mov @acc[5], @acc[11] - adc 8*11($a_ptr), @acc[5] - - sub 8*6($a_ptr), @acc[6] # a->re - a->im - sbb 8*7($a_ptr), @acc[7] - sbb 8*8($a_ptr), @acc[8] - sbb 8*9($a_ptr), @acc[9] - sbb 8*10($a_ptr), @acc[10] - sbb 8*11($a_ptr), @acc[11] - sbb $r_ptr, $r_ptr # borrow flag as mask - - mov @acc[0], 32+8*0(%rsp) # t0 - mov @acc[1], 32+8*1(%rsp) - mov @acc[2], 32+8*2(%rsp) - mov @acc[3], 32+8*3(%rsp) - mov @acc[4], 32+8*4(%rsp) - mov @acc[5], 32+8*5(%rsp) - - mov @acc[6], 32+8*6(%rsp) # t1 - mov @acc[7], 32+8*7(%rsp) - mov @acc[8], 32+8*8(%rsp) - mov @acc[9], 32+8*9(%rsp) - mov @acc[10], 32+8*10(%rsp) - mov @acc[11], 32+8*11(%rsp) - mov $r_ptr, 32+8*12(%rsp) - - ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); - #mov 8*3(%rsp), $a_ptr # a->re - lea 48($a_ptr), $b_ptr # a->im - - mov 48($a_ptr), %rdx - mov 8*0($a_ptr), %r14 # @acc[6] - mov 8*1($a_ptr), %r15 # @acc[7] - mov 8*2($a_ptr), %rax # @acc[8] - mov 8*3($a_ptr), %r12 # @acc[4] - mov 8*4($a_ptr), %rdi # $lo - mov 8*5($a_ptr), %rbp # $hi - lea -128($a_ptr), $a_ptr # control u-op density - lea -128($n_ptr), $n_ptr # control u-op density - - mulx %r14, %r8, %r9 - call __mulx_mont_383_nonred -___ -{ -my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 - 8..11,13,14); -$code.=<<___; - add @acc[0], @acc[0] # add with itself - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - adc @acc[3], @acc[3] - adc @acc[4], @acc[4] - adc @acc[5], @acc[5] - - mov @acc[0], 8*6($b_ptr) # ret->im - mov @acc[1], 8*7($b_ptr) - mov @acc[2], 8*8($b_ptr) - mov @acc[3], 8*9($b_ptr) - mov @acc[4], 8*10($b_ptr) - mov @acc[5], 8*11($b_ptr) -___ -} -$code.=<<___; - ################################# mul_mont_384(ret->re, t0, t1, mod, n0); - lea 32-128(%rsp), $a_ptr # t0 [+u-op density] - lea 32+8*6(%rsp), $b_ptr # t1 - - mov 32+8*6(%rsp), %rdx # t1[0] - mov 32+8*0(%rsp), %r14 # @acc[6] - mov 32+8*1(%rsp), %r15 # @acc[7] - mov 32+8*2(%rsp), %rax # @acc[8] - mov 32+8*3(%rsp), %r12 # @acc[4] - mov 32+8*4(%rsp), %rdi # $lo - mov 32+8*5(%rsp), %rbp # $hi - #lea -128($a_ptr), $a_ptr # control u-op density - #lea -128($n_ptr), $n_ptr # control u-op density - - mulx %r14, %r8, %r9 - call __mulx_mont_383_nonred -___ -{ -my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 - 8..11,13,14); -$code.=<<___; - mov 32+8*12(%rsp), @acc[11] # account for sign from a->re - a->im - lea 128($n_ptr), $n_ptr - mov 32+8*0(%rsp), @acc[6] - and @acc[11], @acc[6] - mov 32+8*1(%rsp), @acc[7] - and @acc[11], @acc[7] - mov 
32+8*2(%rsp), @acc[8] - and @acc[11], @acc[8] - mov 32+8*3(%rsp), @acc[9] - and @acc[11], @acc[9] - mov 32+8*4(%rsp), @acc[10] - and @acc[11], @acc[10] - and 32+8*5(%rsp), @acc[11] - - sub @acc[6], @acc[0] - mov 8*0($n_ptr), @acc[6] - sbb @acc[7], @acc[1] - mov 8*1($n_ptr), @acc[7] - sbb @acc[8], @acc[2] - mov 8*2($n_ptr), @acc[8] - sbb @acc[9], @acc[3] - mov 8*3($n_ptr), @acc[9] - sbb @acc[10], @acc[4] - mov 8*4($n_ptr), @acc[10] - sbb @acc[11], @acc[5] - sbb @acc[11], @acc[11] - - and @acc[11], @acc[6] - and @acc[11], @acc[7] - and @acc[11], @acc[8] - and @acc[11], @acc[9] - and @acc[11], @acc[10] - and 8*5($n_ptr), @acc[11] - - add @acc[6], @acc[0] - adc @acc[7], @acc[1] - adc @acc[8], @acc[2] - adc @acc[9], @acc[3] - adc @acc[10], @acc[4] - adc @acc[11], @acc[5] - - mov @acc[0], 8*0($b_ptr) # ret->re - mov @acc[1], 8*1($b_ptr) - mov @acc[2], 8*2($b_ptr) - mov @acc[3], 8*3($b_ptr) - mov @acc[4], 8*4($b_ptr) - mov @acc[5], 8*5($b_ptr) -___ -} -$code.=<<___; - lea $frame(%rsp), %r8 # size optimization - mov 8*0(%r8),%r15 -.cfi_restore %r15 - mov 8*1(%r8),%r14 -.cfi_restore %r14 - mov 8*2(%r8),%r13 -.cfi_restore %r13 - mov 8*3(%r8),%r12 -.cfi_restore %r12 - mov 8*4(%r8),%rbx -.cfi_restore %rbx - mov 8*5(%r8),%rbp -.cfi_restore %rbp - lea 8*6(%r8),%rsp -.cfi_adjust_cfa_offset -$frame-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size sqrx_mont_382x,.-sqrx_mont_382x -___ -} - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/sha256-armv8.pl b/crypto/blst_src/asm/sha256-armv8.pl deleted file mode 100755 index 1de27c70667..00000000000 --- a/crypto/blst_src/asm/sha256-armv8.pl +++ /dev/null @@ -1,541 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# ==================================================================== -# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL -# project. -# ==================================================================== -# -# sha256_block procedure for ARMv8. -# -# This module is stripped of scalar code paths, with raionale that all -# known processors are NEON-capable. -# -# See original module at CRYPTOGAMS for further details. 
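For orientation while reading the scheduled code below: the rotation-count arrays (@Sigma0, @Sigma1, @sigma0, @sigma1) defined a few lines down are the standard FIPS 180-4 SHA-256 constants, and both the crypto-extension path and the NEON path compute the usual round function and message schedule, only heavily interleaved. The following portable C fragment is a reference-only sketch of those primitives (names such as ror32 and round_00_63 are illustrative and not part of the blst sources):

/* Reference sketch of the SHA-256 primitives the assembly below implements. */
#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }

/* "Big" Sigma functions used in every round: rotations (2,13,22) and (6,11,25). */
static inline uint32_t Sigma0(uint32_t x) { return ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22); }
static inline uint32_t Sigma1(uint32_t x) { return ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25); }
/* "Small" sigma functions used by the message schedule: (7,18,3) and (17,19,10). */
static inline uint32_t sigma0(uint32_t x) { return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3); }
static inline uint32_t sigma1(uint32_t x) { return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10); }

/* One compression round; Ki is the round constant, Wi the scheduled word,
 * where W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16] for i >= 16. */
static void round_00_63(uint32_t S[8], uint32_t Ki, uint32_t Wi)
{
    uint32_t a = S[0], b = S[1], c = S[2], d = S[3];
    uint32_t e = S[4], f = S[5], g = S[6], h = S[7];
    uint32_t T1 = h + Sigma1(e) + ((e & f) ^ (~e & g)) + Ki + Wi;  /* Ch(e,f,g) */
    uint32_t T2 = Sigma0(a) + ((a & b) ^ (a & c) ^ (b & c));       /* Maj(a,b,c) */
    S[7] = g; S[6] = f; S[5] = e; S[4] = d + T1;
    S[3] = c; S[2] = b; S[1] = a; S[0] = T1 + T2;
}

The NEON Xupdate/body_00_15 machinery below interleaves four such rounds with one vectorized schedule step, which is why the generated instruction stream looks nothing like this straight-line form.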
- -$flavour = shift; -$output = shift; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -$BITS=256; -$SZ=4; -@Sigma0=( 2,13,22); -@Sigma1=( 6,11,25); -@sigma0=( 7,18, 3); -@sigma1=(17,19,10); -$rounds=64; -$reg_t="w"; -$pre="blst_"; - -($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); - -$code.=<<___; -.text - -.align 6 -.type .LK$BITS,%object -.LK$BITS: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - .long 0 //terminator -.size .LK$BITS,.-.LK$BITS -.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by \@dot-asm" -.align 2 -___ - -if ($SZ==4) { -my $Ktbl="x3"; - -my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); -my @MSG=map("v$_.16b",(4..7)); -my ($W0,$W1)=("v16.4s","v17.4s"); -my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); - -$code.=<<___; -.globl ${pre}sha256_block_armv8 -.type ${pre}sha256_block_armv8,%function -.align 6 -${pre}sha256_block_armv8: -.Lv8_entry: - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - ld1.32 {$ABCD,$EFGH},[$ctx] - adr $Ktbl,.LK256 - -.Loop_hw: - ld1 {@MSG[0]-@MSG[3]},[$inp],#64 - sub $num,$num,#1 - ld1.32 {$W0},[$Ktbl],#16 - rev32 @MSG[0],@MSG[0] - rev32 @MSG[1],@MSG[1] - rev32 @MSG[2],@MSG[2] - rev32 @MSG[3],@MSG[3] - orr $ABCD_SAVE,$ABCD,$ABCD // offload - orr $EFGH_SAVE,$EFGH,$EFGH -___ -for($i=0;$i<12;$i++) { -$code.=<<___; - ld1.32 {$W1},[$Ktbl],#16 - add.i32 $W0,$W0,@MSG[0] - sha256su0 @MSG[0],@MSG[1] - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W0 - sha256h2 $EFGH,$abcd,$W0 - sha256su1 @MSG[0],@MSG[2],@MSG[3] -___ - ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); -} -$code.=<<___; - ld1.32 {$W1},[$Ktbl],#16 - add.i32 $W0,$W0,@MSG[0] - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W0 - sha256h2 $EFGH,$abcd,$W0 - - ld1.32 {$W0},[$Ktbl],#16 - add.i32 $W1,$W1,@MSG[1] - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W1 - sha256h2 $EFGH,$abcd,$W1 - - ld1.32 {$W1},[$Ktbl] - add.i32 $W0,$W0,@MSG[2] - sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W0 - sha256h2 $EFGH,$abcd,$W0 - - add.i32 $W1,$W1,@MSG[3] - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W1 - sha256h2 $EFGH,$abcd,$W1 - - add.i32 $ABCD,$ABCD,$ABCD_SAVE - add.i32 $EFGH,$EFGH,$EFGH_SAVE - - cbnz $num,.Loop_hw - - st1.32 {$ABCD,$EFGH},[$ctx] - - ldr x29,[sp],#16 - ret -.size ${pre}sha256_block_armv8,.-${pre}sha256_block_armv8 -___ -} - -if ($SZ==4) { ######################################### NEON stuff # -# You'll surely note a lot of similarities with sha256-armv4 module, -# and of course it's not a coincidence. 
sha256-armv4 was used as -# initial template, but was adapted for ARMv8 instruction set and -# extensively re-tuned for all-round performance. - -my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); -my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); -my $Ktbl="x16"; -my $Xfer="x17"; -my @X = map("q$_",(0..3)); -my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); -my $j=0; - -sub AUTOLOAD() # thunk [simplified] x86-style perlasm -{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; - my $arg = pop; - $arg = "#$arg" if ($arg*1 eq $arg); - $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; -} - -sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; } -sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; } -sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; } - -sub Xupdate() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); - my ($a,$b,$c,$d,$e,$f,$g,$h); - - &ext_8 ($T0,@X[0],@X[1],4); # X[1..4] - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &ext_8 ($T3,@X[2],@X[3],4); # X[9..12] - eval(shift(@insns)); - eval(shift(@insns)); - &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15] - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T2,$T0,$sigma0[0]); - eval(shift(@insns)); - &ushr_32 ($T1,$T0,$sigma0[2]); - eval(shift(@insns)); - &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12] - eval(shift(@insns)); - &sli_32 ($T2,$T0,32-$sigma0[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T3,$T0,$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T1,$T1,$T2); - eval(shift(@insns)); - eval(shift(@insns)); - &sli_32 ($T3,$T0,32-$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T4,$T7,$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4]) - eval(shift(@insns)); - eval(shift(@insns)); - &sli_32 ($T4,$T7,32-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T5,$T7,$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T3,$T7,$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) - eval(shift(@insns)); - eval(shift(@insns)); - &sli_u32 ($T3,$T7,32-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T5,$T5,$T4); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T6,@X[0],$sigma1[0]); - eval(shift(@insns)); - &ushr_32 ($T7,@X[0],$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &sli_32 ($T6,@X[0],32-$sigma1[0]); - eval(shift(@insns)); - &ushr_32 ($T5,@X[0],$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T7,$T7,$T6); - eval(shift(@insns)); - eval(shift(@insns)); - &sli_32 ($T5,@X[0],32-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &ld1_32 ("{$T0}","[$Ktbl], #16"); - eval(shift(@insns)); - &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17]) - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T5,$T5,$T5); - eval(shift(@insns)); - eval(shift(@insns)); - &mov (&Dhi($T5), &Dlo($T7)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 ($T0,$T0,@X[0]); - while($#insns>=1) 
{ eval(shift(@insns)); } - &st1_32 ("{$T0}","[$Xfer], #16"); - eval(shift(@insns)); - - push(@X,shift(@X)); # "rotate" X[] -} - -sub Xpreload() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); - my ($a,$b,$c,$d,$e,$f,$g,$h); - - eval(shift(@insns)); - eval(shift(@insns)); - &ld1_8 ("{@X[0]}","[$inp],#16"); - eval(shift(@insns)); - eval(shift(@insns)); - &ld1_32 ("{$T0}","[$Ktbl],#16"); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &rev32 (@X[0],@X[0]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 ($T0,$T0,@X[0]); - foreach (@insns) { eval; } # remaining instructions - &st1_32 ("{$T0}","[$Xfer], #16"); - - push(@X,shift(@X)); # "rotate" X[] -} - -sub body_00_15 () { - ( - '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. - '&add ($h,$h,$t1)', # h+=X[i]+K[i] - '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past - '&and ($t1,$f,$e)', - '&bic ($t4,$g,$e)', - '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', - '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past - '&orr ($t1,$t1,$t4)', # Ch(e,f,g) - '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) - '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', - '&add ($h,$h,$t1)', # h+=Ch(e,f,g) - '&ror ($t0,$t0,"#$Sigma1[0]")', - '&eor ($t2,$a,$b)', # a^b, b^c in next round - '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) - '&add ($h,$h,$t0)', # h+=Sigma1(e) - '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. - '&ldr ($t1,"[$Ktbl]") if ($j==15);'. - '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) - '&ror ($t4,$t4,"#$Sigma0[0]")', - '&add ($d,$d,$h)', # d+=h - '&eor ($t3,$t3,$b)', # Maj(a,b,c) - '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' - ) -} - -$code.=<<___; -.globl ${pre}sha256_block_data_order -.type ${pre}sha256_block_data_order,%function -.align 4 -${pre}sha256_block_data_order: - stp x29, x30, [sp, #-16]! 
- mov x29, sp - sub sp,sp,#16*4 - - adr $Ktbl,.LK256 - add $num,$inp,$num,lsl#6 // len to point at the end of inp - - ld1.8 {@X[0]},[$inp], #16 - ld1.8 {@X[1]},[$inp], #16 - ld1.8 {@X[2]},[$inp], #16 - ld1.8 {@X[3]},[$inp], #16 - ld1.32 {$T0},[$Ktbl], #16 - ld1.32 {$T1},[$Ktbl], #16 - ld1.32 {$T2},[$Ktbl], #16 - ld1.32 {$T3},[$Ktbl], #16 - rev32 @X[0],@X[0] // yes, even on - rev32 @X[1],@X[1] // big-endian - rev32 @X[2],@X[2] - rev32 @X[3],@X[3] - mov $Xfer,sp - add.32 $T0,$T0,@X[0] - add.32 $T1,$T1,@X[1] - add.32 $T2,$T2,@X[2] - st1.32 {$T0-$T1},[$Xfer], #32 - add.32 $T3,$T3,@X[3] - st1.32 {$T2-$T3},[$Xfer] - sub $Xfer,$Xfer,#32 - - ldp $A,$B,[$ctx] - ldp $C,$D,[$ctx,#8] - ldp $E,$F,[$ctx,#16] - ldp $G,$H,[$ctx,#24] - ldr $t1,[sp,#0] - mov $t2,wzr - eor $t3,$B,$C - mov $t4,wzr - b .L_00_48 - -.align 4 -.L_00_48: -___ - &Xupdate(\&body_00_15); - &Xupdate(\&body_00_15); - &Xupdate(\&body_00_15); - &Xupdate(\&body_00_15); -$code.=<<___; - cmp $t1,#0 // check for K256 terminator - ldr $t1,[sp,#0] - sub $Xfer,$Xfer,#64 - bne .L_00_48 - - sub $Ktbl,$Ktbl,#256 // rewind $Ktbl - cmp $inp,$num - mov $Xfer, #64 - csel $Xfer, $Xfer, xzr, eq - sub $inp,$inp,$Xfer // avoid SEGV - mov $Xfer,sp -___ - &Xpreload(\&body_00_15); - &Xpreload(\&body_00_15); - &Xpreload(\&body_00_15); - &Xpreload(\&body_00_15); -$code.=<<___; - add $A,$A,$t4 // h+=Sigma0(a) from the past - ldp $t0,$t1,[$ctx,#0] - add $A,$A,$t2 // h+=Maj(a,b,c) from the past - ldp $t2,$t3,[$ctx,#8] - add $A,$A,$t0 // accumulate - add $B,$B,$t1 - ldp $t0,$t1,[$ctx,#16] - add $C,$C,$t2 - add $D,$D,$t3 - ldp $t2,$t3,[$ctx,#24] - add $E,$E,$t0 - add $F,$F,$t1 - ldr $t1,[sp,#0] - stp $A,$B,[$ctx,#0] - add $G,$G,$t2 - mov $t2,wzr - stp $C,$D,[$ctx,#8] - add $H,$H,$t3 - stp $E,$F,[$ctx,#16] - eor $t3,$B,$C - stp $G,$H,[$ctx,#24] - mov $t4,wzr - mov $Xfer,sp - b.ne .L_00_48 - - ldr x29,[x29] - add sp,sp,#16*4+16 - ret -.size ${pre}sha256_block_data_order,.-${pre}sha256_block_data_order -___ -} - -{ -my ($out,$inp,$len) = map("x$_",(0..2)); - -$code.=<<___; -.globl ${pre}sha256_emit -.hidden ${pre}sha256_emit -.type ${pre}sha256_emit,%function -.align 4 -${pre}sha256_emit: - ldp x4,x5,[$inp] - ldp x6,x7,[$inp,#16] -#ifndef __AARCH64EB__ - rev x4,x4 - rev x5,x5 - rev x6,x6 - rev x7,x7 -#endif - str w4,[$out,#4] - lsr x4,x4,#32 - str w5,[$out,#12] - lsr x5,x5,#32 - str w6,[$out,#20] - lsr x6,x6,#32 - str w7,[$out,#28] - lsr x7,x7,#32 - str w4,[$out,#0] - str w5,[$out,#8] - str w6,[$out,#16] - str w7,[$out,#24] - ret -.size ${pre}sha256_emit,.-${pre}sha256_emit - -.globl ${pre}sha256_bcopy -.hidden ${pre}sha256_bcopy -.type ${pre}sha256_bcopy,%function -.align 4 -${pre}sha256_bcopy: -.Loop_bcopy: - ldrb w3,[$inp],#1 - sub $len,$len,#1 - strb w3,[$out],#1 - cbnz $len,.Loop_bcopy - ret -.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy - -.globl ${pre}sha256_hcopy -.hidden ${pre}sha256_hcopy -.type ${pre}sha256_hcopy,%function -.align 4 -${pre}sha256_hcopy: - ldp x4,x5,[$inp] - ldp x6,x7,[$inp,#16] - stp x4,x5,[$out] - stp x6,x7,[$out,#16] - ret -.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy -___ -} - -{ my %opcode = ( - "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, - "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); - - sub unsha256 { - my ($mnemonic,$arg)=@_; - - $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o - && - sprintf ".inst\t0x%08x\t//%s %s", - $opcode{$mnemonic}|$1|($2<<5)|($3<<16), - $mnemonic,$arg; - } -} - -open SELF,$0; -while() { - next if (/^#!/); - last if (!s/^#/\/\// and !/^$/); - print; -} -close SELF; - 
-foreach(split("\n",$code)) { - - s/\`([^\`]*)\`/eval($1)/ge; - - s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge or - s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; - - s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers - - s/\.[ui]?8(\s)/$1/; - s/\.\w?64\b// and s/\.16b/\.2d/g or - s/\.\w?32\b// and s/\.16b/\.4s/g; - m/\bext\b/ and s/\.2d/\.16b/g or - m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; - - print $_,"\n"; -} - -close STDOUT; diff --git a/crypto/blst_src/asm/sha256-portable-x86_64.pl b/crypto/blst_src/asm/sha256-portable-x86_64.pl deleted file mode 100755 index eca0564ebe7..00000000000 --- a/crypto/blst_src/asm/sha256-portable-x86_64.pl +++ /dev/null @@ -1,337 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# ==================================================================== -# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL -# project. -# ==================================================================== -# -# sha256_block procedure for x86_64. -# -# Scalar-only version with minor twist minimizing 'lea' instructions. - -$flavour = shift; -$output = pop; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" - or die "can't call $xlate: $!"; - -$pre="blst_"; -$func="${pre}sha256_block_data_order"; -$TABLE="K256"; -$SZ=4; -@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", - "%r8d","%r9d","%r10d","%r11d"); -($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); -@Sigma0=( 2,13,22); -@Sigma1=( 6,11,25); -@sigma0=( 7,18, 3); -@sigma1=(17,19,10); -$rounds=64; - -$ctx="%rdi"; # 1st arg, zapped by $a3 -$inp="%rsi"; # 2nd arg -$Tbl="%rbp"; - -$_ctx="16*$SZ+0*8(%rsp)"; -$_inp="16*$SZ+1*8(%rsp)"; -$_end="16*$SZ+2*8(%rsp)"; -$framesz="16*$SZ+3*8"; - -sub ROUND_00_15() -{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; - my $STRIDE=$SZ; - # $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); - -$code.=<<___; - ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 - mov $f,$a2 - - xor $e,$a0 - ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 - xor $g,$a2 # f^g - - mov $T1,`$SZ*($i&0xf)`(%rsp) - xor $a,$a1 - and $e,$a2 # (f^g)&e - - ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 - add $h,$T1 # T1+=h - xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g - - ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 - xor $e,$a0 - add $a2,$T1 # T1+=Ch(e,f,g) - - mov $a,$a2 - add `$SZ*$i`($Tbl),$T1 # T1+=K[round] - xor $a,$a1 - - xor $b,$a2 # a^b, b^c in next round - ror \$$Sigma1[0],$a0 # Sigma1(e) - mov $b,$h - - and $a2,$a3 - ror \$$Sigma0[0],$a1 # Sigma0(a) - add $a0,$T1 # T1+=Sigma1(e) - - xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b) - add $T1,$d # d+=T1 - add $T1,$h # h+=T1 -___ -$code.=<<___ if ($i==31); - lea `16*$SZ`($Tbl),$Tbl # round+=16 -___ -$code.=<<___ if ($i<15); - add $a1,$h # h+=Sigma0(a) -___ - ($a2,$a3) = ($a3,$a2); -} - -sub ROUND_16_XX() -{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; - -$code.=<<___; - mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 - mov `$SZ*(($i+14)&0xf)`(%rsp),$a2 - - mov $a0,$T1 - ror \$`$sigma0[1]-$sigma0[0]`,$a0 - add $a1,$a # modulo-scheduled h+=Sigma0(a) - mov $a2,$a1 - ror \$`$sigma1[1]-$sigma1[0]`,$a2 - - xor $T1,$a0 - shr \$$sigma0[2],$T1 - ror \$$sigma0[0],$a0 - xor $a1,$a2 - shr 
\$$sigma1[2],$a1 - - ror \$$sigma1[0],$a2 - xor $a0,$T1 # sigma0(X[(i+1)&0xf]) - xor $a1,$a2 # sigma1(X[(i+14)&0xf]) - add `$SZ*(($i+9)&0xf)`(%rsp),$T1 - - add `$SZ*($i&0xf)`(%rsp),$T1 - mov $e,$a0 - add $a2,$T1 - mov $a,$a1 -___ - &ROUND_00_15(@_); -} - -$code=<<___; -.text - -.globl $func -.type $func,\@function,3,"unwind" -.align 16 -$func: -.cfi_startproc - push %rbx -.cfi_push %rbx - push %rbp -.cfi_push %rbp - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - shl \$4,%rdx # num*16 - sub \$$framesz,%rsp -.cfi_adjust_cfa_offset $framesz - lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ - mov $ctx,$_ctx # save ctx, 1st arg - mov $inp,$_inp # save inp, 2nd arh - mov %rdx,$_end # save end pointer, "3rd" arg -.cfi_end_prologue - - mov $SZ*0($ctx),$A - mov $SZ*1($ctx),$B - mov $SZ*2($ctx),$C - mov $SZ*3($ctx),$D - mov $SZ*4($ctx),$E - mov $SZ*5($ctx),$F - mov $SZ*6($ctx),$G - mov $SZ*7($ctx),$H - jmp .Lloop - -.align 16 -.Lloop: - mov $B,$a3 - lea $TABLE(%rip),$Tbl - xor $C,$a3 # magic -___ - for($i=0;$i<16;$i++) { - $code.=" mov $SZ*$i($inp),$T1\n"; - $code.=" mov @ROT[4],$a0\n"; - $code.=" mov @ROT[0],$a1\n"; - $code.=" bswap $T1\n"; - &ROUND_00_15($i,@ROT); - unshift(@ROT,pop(@ROT)); - } -$code.=<<___; - jmp .Lrounds_16_xx -.align 16 -.Lrounds_16_xx: -___ - for(;$i<32;$i++) { - &ROUND_16_XX($i,@ROT); - unshift(@ROT,pop(@ROT)); - } - -$code.=<<___; - cmpb \$0x19,`$SZ-1`($Tbl) - jnz .Lrounds_16_xx - - mov $_ctx,$ctx - add $a1,$A # modulo-scheduled h+=Sigma0(a) - lea 16*$SZ($inp),$inp - - add $SZ*0($ctx),$A - add $SZ*1($ctx),$B - add $SZ*2($ctx),$C - add $SZ*3($ctx),$D - add $SZ*4($ctx),$E - add $SZ*5($ctx),$F - add $SZ*6($ctx),$G - add $SZ*7($ctx),$H - - cmp $_end,$inp - - mov $A,$SZ*0($ctx) - mov $B,$SZ*1($ctx) - mov $C,$SZ*2($ctx) - mov $D,$SZ*3($ctx) - mov $E,$SZ*4($ctx) - mov $F,$SZ*5($ctx) - mov $G,$SZ*6($ctx) - mov $H,$SZ*7($ctx) - jb .Lloop - - lea $framesz+6*8(%rsp),%r11 -.cfi_def_cfa %r11,8 - mov $framesz(%rsp),%r15 -.cfi_restore %r15 - mov -40(%r11),%r14 -.cfi_restore %r14 - mov -32(%r11),%r13 -.cfi_restore %r13 - mov -24(%r11),%r12 -.cfi_restore %r12 - mov -16(%r11),%rbp -.cfi_restore %rbp - mov -8(%r11),%rbx -.cfi_restore %rbx -.cfi_epilogue - lea (%r11),%rsp - ret -.cfi_endproc -.size $func,.-$func - -.align 64 -.type $TABLE,\@object -$TABLE: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - - .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm" -___ -{ -my ($out,$inp,$len) = $win64 ? 
("%rcx","%rdx","%r8") : # Win64 order - ("%rdi","%rsi","%rdx"); # Unix order -$code.=<<___; -.globl ${pre}sha256_emit -.hidden ${pre}sha256_emit -.type ${pre}sha256_emit,\@abi-omnipotent -.align 16 -${pre}sha256_emit: - mov 0($inp), %r8 - mov 8($inp), %r9 - mov 16($inp), %r10 - bswap %r8 - mov 24($inp), %r11 - bswap %r9 - mov %r8d, 4($out) - bswap %r10 - mov %r9d, 12($out) - bswap %r11 - mov %r10d, 20($out) - shr \$32, %r8 - mov %r11d, 28($out) - shr \$32, %r9 - mov %r8d, 0($out) - shr \$32, %r10 - mov %r9d, 8($out) - shr \$32, %r11 - mov %r10d, 16($out) - mov %r11d, 24($out) - ret -.size ${pre}sha256_emit,.-${pre}sha256_emit - -.globl ${pre}sha256_bcopy -.hidden ${pre}sha256_bcopy -.type ${pre}sha256_bcopy,\@abi-omnipotent -.align 16 -${pre}sha256_bcopy: - sub $inp, $out -.Loop_bcopy: - movzb ($inp), %eax - lea 1($inp), $inp - mov %al, -1($out,$inp) - dec $len - jnz .Loop_bcopy - ret -.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy - -.globl ${pre}sha256_hcopy -.hidden ${pre}sha256_hcopy -.type ${pre}sha256_hcopy,\@abi-omnipotent -.align 16 -${pre}sha256_hcopy: - mov 0($inp), %r8 - mov 8($inp), %r9 - mov 16($inp), %r10 - mov 24($inp), %r11 - mov %r8, 0($out) - mov %r9, 8($out) - mov %r10, 16($out) - mov %r11, 24($out) - ret -.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy -___ -} - -foreach (split("\n",$code)) { - s/\`([^\`]*)\`/eval $1/geo; - print $_,"\n"; -} -close STDOUT; diff --git a/crypto/blst_src/asm/sha256-x86_64.pl b/crypto/blst_src/asm/sha256-x86_64.pl deleted file mode 100755 index 22b376318fa..00000000000 --- a/crypto/blst_src/asm/sha256-x86_64.pl +++ /dev/null @@ -1,789 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# ==================================================================== -# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL -# project. -# ==================================================================== -# -# sha256_block procedure for x86_64. -# -# This module is stripped of AVX and even scalar code paths, with -# raionale that -# -# a) AVX1 is [justifiably] faster than SSSE3 code path only on *one* -# processor, venerable Sandy Bridge; -# b) AVX2 incurs costly power transitions, which would be justifiable -# if AVX2 code was executing most of the time, which is not the -# case in the context; -# c) all comtemporary processors support SSSE3, so that nobody would -# actually use scalar code path anyway; -# -# See original module at CRYPTOGAMS for further details. 
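The generator below emits blst_sha256_block_data_order (an SSSE3 path plus a SHA-extension variant) together with the small blst_sha256_emit, blst_sha256_bcopy and blst_sha256_hcopy helpers. As a hedged sketch of how those entry points fit together, here is a minimal one-shot driver in C; the prototypes are inferred from the register usage in the assembly (state pointer, input pointer, number of 64-byte blocks) rather than copied from a blst header, and sha256_oneshot is an illustrative name, not a blst function:

/* Minimal sketch, assuming the prototypes below match the assembly's calling
 * convention; not part of the blst sources. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

void blst_sha256_block_data_order(uint32_t *h, const void *inp, size_t blocks);
void blst_sha256_emit(unsigned char md[32], const uint32_t h[8]);

static void sha256_oneshot(unsigned char md[32], const unsigned char *msg, size_t len)
{
    uint32_t h[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
                      0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
    unsigned char tail[128] = {0};
    size_t full = len & ~(size_t)63;            /* bytes in whole 64-byte blocks */
    size_t rem  = len - full;
    size_t padded = (rem + 1 + 8 <= 64) ? 64 : 128;
    uint64_t bits = (uint64_t)len * 8;

    if (full)
        blst_sha256_block_data_order(h, msg, full / 64);

    /* Standard SHA-256 padding: 0x80, zeros, 64-bit big-endian bit count. */
    memcpy(tail, msg + full, rem);
    tail[rem] = 0x80;
    for (size_t i = 0; i < 8; i++)
        tail[padded - 1 - i] = (unsigned char)(bits >> (8 * i));

    blst_sha256_block_data_order(h, tail, padded / 64);
    blst_sha256_emit(md, h);                    /* native state -> big-endian digest */
}

blst_sha256_hcopy and blst_sha256_bcopy, also emitted below, are plain 32-byte and byte-wise copies used by the C-level callers and are not needed in this sketch.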
- -$flavour = shift; -$output = pop; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" - or die "can't call $xlate: $!"; - -$pre="blst_"; -$func="${pre}sha256_block_data_order"; -$TABLE="K256"; -$SZ=4; -@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", - "%r8d","%r9d","%r10d","%r11d"); -($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); -@Sigma0=( 2,13,22); -@Sigma1=( 6,11,25); -@sigma0=( 7,18, 3); -@sigma1=(17,19,10); -$rounds=64; - -$ctx="%rdi"; # 1st arg, zapped by $a3 -$inp="%rsi"; # 2nd arg -$Tbl="%rbp"; - -$_ctx="16*$SZ+0*8(%rsp)"; -$_inp="16*$SZ+1*8(%rsp)"; -$_end="16*$SZ+2*8(%rsp)"; -$framesz="16*$SZ+3*8"; - -$code=<<___; -.text - -.align 64 -.type $TABLE,\@object -$TABLE: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - - .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f - .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff - .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 - .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm" -___ - -###################################################################### -# SIMD code paths -# -{{{ -###################################################################### -# Intel SHA Extensions implementation of SHA256 update function. 
-# -my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); - -my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); -my @MSG=map("%xmm$_",(3..6)); - -$code.=<<___; -.globl ${pre}sha256_block_data_order_shaext -.hidden ${pre}sha256_block_data_order_shaext -.type ${pre}sha256_block_data_order_shaext,\@function,3,"unwind" -.align 64 -${pre}sha256_block_data_order_shaext: -.cfi_startproc -___ -$code.=<<___ if ($win64); - sub \$0x58,%rsp -.cfi_adjust_cfa_offset 0x58 - movaps %xmm6,-0x58(%r11) -.cfi_offset %xmm6,-0x60 - movaps %xmm7,-0x48(%r11) -.cfi_offset %xmm7,-0x50 - movaps %xmm8,-0x38(%r11) -.cfi_offset %xmm8,-0x40 - movaps %xmm9,-0x28(%r11) -.cfi_offset %xmm9,-0x30 - movaps %xmm10,-0x18(%r11) -.cfi_offset %xmm10,-0x20 -.cfi_end_prologue -___ -$code.=<<___; - lea K256+0x80(%rip),$Tbl - movdqu ($ctx),$ABEF # DCBA - movdqu 16($ctx),$CDGH # HGFE - movdqa 0x100-0x80($Tbl),$TMP # byte swap mask - - pshufd \$0x1b,$ABEF,$Wi # ABCD - pshufd \$0xb1,$ABEF,$ABEF # CDAB - pshufd \$0x1b,$CDGH,$CDGH # EFGH - movdqa $TMP,$BSWAP # offload - palignr \$8,$CDGH,$ABEF # ABEF - punpcklqdq $Wi,$CDGH # CDGH - jmp .Loop_shaext - -.align 16 -.Loop_shaext: - movdqu ($inp),@MSG[0] - movdqu 0x10($inp),@MSG[1] - movdqu 0x20($inp),@MSG[2] - pshufb $TMP,@MSG[0] - movdqu 0x30($inp),@MSG[3] - - movdqa 0*16-0x80($Tbl),$Wi - paddd @MSG[0],$Wi - pshufb $TMP,@MSG[1] - movdqa $CDGH,$CDGH_SAVE # offload - sha256rnds2 $ABEF,$CDGH # 0-3 - pshufd \$0x0e,$Wi,$Wi - nop - movdqa $ABEF,$ABEF_SAVE # offload - sha256rnds2 $CDGH,$ABEF - - movdqa 1*16-0x80($Tbl),$Wi - paddd @MSG[1],$Wi - pshufb $TMP,@MSG[2] - sha256rnds2 $ABEF,$CDGH # 4-7 - pshufd \$0x0e,$Wi,$Wi - lea 0x40($inp),$inp - sha256msg1 @MSG[1],@MSG[0] - sha256rnds2 $CDGH,$ABEF - - movdqa 2*16-0x80($Tbl),$Wi - paddd @MSG[2],$Wi - pshufb $TMP,@MSG[3] - sha256rnds2 $ABEF,$CDGH # 8-11 - pshufd \$0x0e,$Wi,$Wi - movdqa @MSG[3],$TMP - palignr \$4,@MSG[2],$TMP - nop - paddd $TMP,@MSG[0] - sha256msg1 @MSG[2],@MSG[1] - sha256rnds2 $CDGH,$ABEF - - movdqa 3*16-0x80($Tbl),$Wi - paddd @MSG[3],$Wi - sha256msg2 @MSG[3],@MSG[0] - sha256rnds2 $ABEF,$CDGH # 12-15 - pshufd \$0x0e,$Wi,$Wi - movdqa @MSG[0],$TMP - palignr \$4,@MSG[3],$TMP - nop - paddd $TMP,@MSG[1] - sha256msg1 @MSG[3],@MSG[2] - sha256rnds2 $CDGH,$ABEF -___ -for($i=4;$i<16-3;$i++) { -$code.=<<___; - movdqa $i*16-0x80($Tbl),$Wi - paddd @MSG[0],$Wi - sha256msg2 @MSG[0],@MSG[1] - sha256rnds2 $ABEF,$CDGH # 16-19... 
- pshufd \$0x0e,$Wi,$Wi - movdqa @MSG[1],$TMP - palignr \$4,@MSG[0],$TMP - nop - paddd $TMP,@MSG[2] - sha256msg1 @MSG[0],@MSG[3] - sha256rnds2 $CDGH,$ABEF -___ - push(@MSG,shift(@MSG)); -} -$code.=<<___; - movdqa 13*16-0x80($Tbl),$Wi - paddd @MSG[0],$Wi - sha256msg2 @MSG[0],@MSG[1] - sha256rnds2 $ABEF,$CDGH # 52-55 - pshufd \$0x0e,$Wi,$Wi - movdqa @MSG[1],$TMP - palignr \$4,@MSG[0],$TMP - sha256rnds2 $CDGH,$ABEF - paddd $TMP,@MSG[2] - - movdqa 14*16-0x80($Tbl),$Wi - paddd @MSG[1],$Wi - sha256rnds2 $ABEF,$CDGH # 56-59 - pshufd \$0x0e,$Wi,$Wi - sha256msg2 @MSG[1],@MSG[2] - movdqa $BSWAP,$TMP - sha256rnds2 $CDGH,$ABEF - - movdqa 15*16-0x80($Tbl),$Wi - paddd @MSG[2],$Wi - nop - sha256rnds2 $ABEF,$CDGH # 60-63 - pshufd \$0x0e,$Wi,$Wi - dec $num - nop - sha256rnds2 $CDGH,$ABEF - - paddd $CDGH_SAVE,$CDGH - paddd $ABEF_SAVE,$ABEF - jnz .Loop_shaext - - pshufd \$0xb1,$CDGH,$CDGH # DCHG - pshufd \$0x1b,$ABEF,$TMP # FEBA - pshufd \$0xb1,$ABEF,$ABEF # BAFE - punpckhqdq $CDGH,$ABEF # DCBA - palignr \$8,$TMP,$CDGH # HGFE - - movdqu $ABEF,($ctx) - movdqu $CDGH,16($ctx) -___ -$code.=<<___ if ($win64); - movaps -0x58(%r11),%xmm6 - movaps -0x48(%r11),%xmm7 - movaps -0x38(%r11),%xmm8 - movaps -0x28(%r11),%xmm9 - movaps -0x18(%r11),%xmm10 - mov %r11,%rsp -.cfi_def_cfa %r11,8 -.cfi_epilogue -___ -$code.=<<___; - ret -.cfi_endproc -.size ${pre}sha256_block_data_order_shaext,.-${pre}sha256_block_data_order_shaext -___ -}}} -{{{ - -my $a4=$T1; -my ($a,$b,$c,$d,$e,$f,$g,$h); - -sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm -{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; - my $arg = pop; - $arg = "\$$arg" if ($arg*1 eq $arg); - $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; -} - -sub body_00_15 () { - ( - '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. - - '&ror ($a0,$Sigma1[2]-$Sigma1[1])', - '&mov ($a,$a1)', - '&mov ($a4,$f)', - - '&ror ($a1,$Sigma0[2]-$Sigma0[1])', - '&xor ($a0,$e)', - '&xor ($a4,$g)', # f^g - - '&ror ($a0,$Sigma1[1]-$Sigma1[0])', - '&xor ($a1,$a)', - '&and ($a4,$e)', # (f^g)&e - - '&xor ($a0,$e)', - '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] - '&mov ($a2,$a)', - - '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g - '&ror ($a1,$Sigma0[1]-$Sigma0[0])', - '&xor ($a2,$b)', # a^b, b^c in next round - - '&add ($h,$a4)', # h+=Ch(e,f,g) - '&ror ($a0,$Sigma1[0])', # Sigma1(e) - '&and ($a3,$a2)', # (b^c)&(a^b) - - '&xor ($a1,$a)', - '&add ($h,$a0)', # h+=Sigma1(e) - '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) - - '&ror ($a1,$Sigma0[0])', # Sigma0(a) - '&add ($d,$h)', # d+=h - '&add ($h,$a3)', # h+=Maj(a,b,c) - - '&mov ($a0,$d)', - '&add ($a1,$h);'. 
# h+=Sigma0(a) - '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' - ); -} - -###################################################################### -# SSSE3 code path -# -{ -my $Tbl = $inp; -my $_ctx="0(%rbp)"; -my $_inp="8(%rbp)"; -my $_end="16(%rbp)"; -my $framesz=4*8+$win64*16*4+8; - -my @X = map("%xmm$_",(0..3)); -my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); - -$code.=<<___; -.globl ${func} -.hidden ${func} -.type ${func},\@function,3,"unwind" -.align 64 -${func}: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - shl \$4,%rdx # num*16 - sub \$$framesz,%rsp -.cfi_adjust_cfa_offset $framesz - lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ - mov $ctx,0(%rsp) # save ctx, 1st arg - #mov $inp,8(%rsp) # save inp, 2nd arg - mov %rdx,16(%rsp) # save end pointer, "3rd" arg -___ -$code.=<<___ if ($win64); - movaps %xmm6,0x20(%rsp) -.cfi_offset %xmm6,-0x78 - movaps %xmm7,0x30(%rsp) -.cfi_offset %xmm7,-0x68 - movaps %xmm8,0x40(%rsp) -.cfi_offset %xmm8,-0x58 - movaps %xmm9,0x50(%rsp) -.cfi_offset %xmm9,-0x48 -___ -$code.=<<___; - mov %rsp,%rbp -.cfi_def_cfa_register %rbp -.cfi_end_prologue - - lea -16*$SZ(%rsp),%rsp - mov $SZ*0($ctx),$A - and \$-64,%rsp # align stack - mov $SZ*1($ctx),$B - mov $SZ*2($ctx),$C - mov $SZ*3($ctx),$D - mov $SZ*4($ctx),$E - mov $SZ*5($ctx),$F - mov $SZ*6($ctx),$G - mov $SZ*7($ctx),$H -___ - -$code.=<<___; - #movdqa $TABLE+`$SZ*$rounds`+32(%rip),$t4 - #movdqa $TABLE+`$SZ*$rounds`+64(%rip),$t5 - jmp .Lloop_ssse3 -.align 16 -.Lloop_ssse3: - movdqa $TABLE+`$SZ*$rounds`(%rip),$t3 - mov $inp,$_inp # offload $inp - movdqu 0x00($inp),@X[0] - movdqu 0x10($inp),@X[1] - movdqu 0x20($inp),@X[2] - pshufb $t3,@X[0] - movdqu 0x30($inp),@X[3] - lea $TABLE(%rip),$Tbl - pshufb $t3,@X[1] - movdqa 0x00($Tbl),$t0 - movdqa 0x10($Tbl),$t1 - pshufb $t3,@X[2] - paddd @X[0],$t0 - movdqa 0x20($Tbl),$t2 - pshufb $t3,@X[3] - movdqa 0x30($Tbl),$t3 - paddd @X[1],$t1 - paddd @X[2],$t2 - paddd @X[3],$t3 - movdqa $t0,0x00(%rsp) - mov $A,$a1 - movdqa $t1,0x10(%rsp) - mov $B,$a3 - movdqa $t2,0x20(%rsp) - xor $C,$a3 # magic - movdqa $t3,0x30(%rsp) - mov $E,$a0 - jmp .Lssse3_00_47 - -.align 16 -.Lssse3_00_47: - sub \$`-16*$SZ`,$Tbl # size optimization -___ -sub Xupdate_256_SSSE3 () { - ( - '&movdqa ($t0,@X[1]);', - '&movdqa ($t3,@X[3])', - '&palignr ($t0,@X[0],$SZ)', # X[1..4] - '&palignr ($t3,@X[2],$SZ);', # X[9..12] - '&movdqa ($t1,$t0)', - '&movdqa ($t2,$t0);', - '&psrld ($t0,$sigma0[2])', - '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] - '&psrld ($t2,$sigma0[0])', - '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] - '&pslld ($t1,8*$SZ-$sigma0[1]);'. - '&pxor ($t0,$t2)', - '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. - '&pxor ($t0,$t1)', - '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
- '&pxor ($t0,$t2);', - '&movdqa ($t2,$t3)', - '&pxor ($t0,$t1);', # sigma0(X[1..4]) - '&psrld ($t3,$sigma1[2])', - '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) - '&psrlq ($t2,$sigma1[0])', - '&pxor ($t3,$t2);', - '&psrlq ($t2,$sigma1[1]-$sigma1[0])', - '&pxor ($t3,$t2)', - '&pshufb ($t3,$t4)', # sigma1(X[14..15]) - '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) - '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] - '&movdqa ($t2,$t3);', - '&psrld ($t3,$sigma1[2])', - '&psrlq ($t2,$sigma1[0])', - '&pxor ($t3,$t2);', - '&psrlq ($t2,$sigma1[1]-$sigma1[0])', - '&pxor ($t3,$t2);', - '&movdqa ($t2,16*$j."($Tbl)")', - '&pshufb ($t3,$t5)', - '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) - ); -} - -sub SSSE3_256_00_47 () { -my $j = shift; -my $body = shift; -my @X = @_; -my @insns = (&$body,&$body,&$body,&$body); # 104 instructions - - if (0) { - foreach (Xupdate_256_SSSE3()) { # 36 instructions - eval; - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - } - } else { # squeeze extra 4% on Westmere and 19% on Atom - eval(shift(@insns)); #@ - &movdqa ($t0,@X[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa ($t3,@X[3]); - eval(shift(@insns)); #@ - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); #@ - eval(shift(@insns)); - &palignr ($t0,@X[0],$SZ); # X[1..4] - eval(shift(@insns)); - eval(shift(@insns)); - &palignr ($t3,@X[2],$SZ); # X[9..12] - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); #@ - &movdqa ($t1,$t0); - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa ($t2,$t0); - eval(shift(@insns)); #@ - eval(shift(@insns)); - &psrld ($t0,$sigma0[2]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &paddd (@X[0],$t3); # X[0..3] += X[9..12] - eval(shift(@insns)); #@ - eval(shift(@insns)); - &psrld ($t2,$sigma0[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &pshufd ($t3,@X[3],0b11111010); # X[4..15] - eval(shift(@insns)); - eval(shift(@insns)); #@ - &pslld ($t1,8*$SZ-$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &pxor ($t0,$t2); - eval(shift(@insns)); #@ - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); #@ - &psrld ($t2,$sigma0[1]-$sigma0[0]); - eval(shift(@insns)); - &pxor ($t0,$t1); - eval(shift(@insns)); - eval(shift(@insns)); - &pslld ($t1,$sigma0[1]-$sigma0[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &pxor ($t0,$t2); - eval(shift(@insns)); - eval(shift(@insns)); #@ - &movdqa ($t2,$t3); - eval(shift(@insns)); - eval(shift(@insns)); - &pxor ($t0,$t1); # sigma0(X[1..4]) - eval(shift(@insns)); #@ - eval(shift(@insns)); - eval(shift(@insns)); - &psrld ($t3,$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) - eval(shift(@insns)); #@ - eval(shift(@insns)); - &psrlq ($t2,$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &pxor ($t3,$t2); - eval(shift(@insns)); #@ - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); #@ - &psrlq ($t2,$sigma1[1]-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &pxor ($t3,$t2); - eval(shift(@insns)); #@ - eval(shift(@insns)); - eval(shift(@insns)); - #&pshufb ($t3,$t4); # sigma1(X[14..15]) - &pshufd ($t3,$t3,0b10000000); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &psrldq ($t3,8); - eval(shift(@insns)); - eval(shift(@insns)); #@ - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); #@ - &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) - 
eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &pshufd ($t3,@X[0],0b01010000); # X[16..17] - eval(shift(@insns)); - eval(shift(@insns)); #@ - eval(shift(@insns)); - &movdqa ($t2,$t3); - eval(shift(@insns)); - eval(shift(@insns)); - &psrld ($t3,$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); #@ - &psrlq ($t2,$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &pxor ($t3,$t2); - eval(shift(@insns)); #@ - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); #@ - eval(shift(@insns)); - &psrlq ($t2,$sigma1[1]-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &pxor ($t3,$t2); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); #@ - #&pshufb ($t3,$t5); - &pshufd ($t3,$t3,0b00001000); - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa ($t2,16*$j."($Tbl)"); - eval(shift(@insns)); #@ - eval(shift(@insns)); - &pslldq ($t3,8); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) - eval(shift(@insns)); #@ - eval(shift(@insns)); - eval(shift(@insns)); - } - &paddd ($t2,@X[0]); - foreach (@insns) { eval; } # remaining instructions - &movdqa (16*$j."(%rsp)",$t2); -} - - for ($i=0,$j=0; $j<4; $j++) { - &SSSE3_256_00_47($j,\&body_00_15,@X); - push(@X,shift(@X)); # rotate(@X) - } - &cmpb ($SZ-1+16*$SZ."($Tbl)",0); - &jne (".Lssse3_00_47"); - - for ($i=0; $i<16; ) { - foreach(body_00_15()) { eval; } - } -$code.=<<___; - mov $_ctx,$ctx - mov $a1,$A - mov $_inp,$inp - - add $SZ*0($ctx),$A - add $SZ*1($ctx),$B - add $SZ*2($ctx),$C - add $SZ*3($ctx),$D - add $SZ*4($ctx),$E - add $SZ*5($ctx),$F - add $SZ*6($ctx),$G - add $SZ*7($ctx),$H - - lea 16*$SZ($inp),$inp - cmp $_end,$inp - - mov $A,$SZ*0($ctx) - mov $B,$SZ*1($ctx) - mov $C,$SZ*2($ctx) - mov $D,$SZ*3($ctx) - mov $E,$SZ*4($ctx) - mov $F,$SZ*5($ctx) - mov $G,$SZ*6($ctx) - mov $H,$SZ*7($ctx) - jb .Lloop_ssse3 - - xorps %xmm0, %xmm0 - lea $framesz+6*8(%rbp),%r11 -.cfi_def_cfa %r11,8 - movaps %xmm0, 0x00(%rsp) # scrub the stack - movaps %xmm0, 0x10(%rsp) - movaps %xmm0, 0x20(%rsp) - movaps %xmm0, 0x30(%rsp) -___ -$code.=<<___ if ($win64); - movaps 0x20(%rbp),%xmm6 - movaps 0x30(%rbp),%xmm7 - movaps 0x40(%rbp),%xmm8 - movaps 0x50(%rbp),%xmm9 -___ -$code.=<<___; - mov $framesz(%rbp),%r15 -.cfi_restore %r15 - mov -40(%r11),%r14 -.cfi_restore %r14 - mov -32(%r11),%r13 -.cfi_restore %r13 - mov -24(%r11),%r12 -.cfi_restore %r12 - mov -16(%r11),%rbx -.cfi_restore %rbx - mov -8(%r11),%rbp -.cfi_restore %rbp -.cfi_epilogue - lea (%r11),%rsp - ret -.cfi_endproc -.size ${func},.-${func} -___ -} -}}} -{ -my ($out,$inp,$len) = $win64 ? 
("%rcx","%rdx","%r8") : # Win64 order - ("%rdi","%rsi","%rdx"); # Unix order -$code.=<<___; -.globl ${pre}sha256_emit -.hidden ${pre}sha256_emit -.type ${pre}sha256_emit,\@abi-omnipotent -.align 16 -${pre}sha256_emit: - mov 0($inp), %r8 - mov 8($inp), %r9 - mov 16($inp), %r10 - bswap %r8 - mov 24($inp), %r11 - bswap %r9 - mov %r8d, 4($out) - bswap %r10 - mov %r9d, 12($out) - bswap %r11 - mov %r10d, 20($out) - shr \$32, %r8 - mov %r11d, 28($out) - shr \$32, %r9 - mov %r8d, 0($out) - shr \$32, %r10 - mov %r9d, 8($out) - shr \$32, %r11 - mov %r10d, 16($out) - mov %r11d, 24($out) - ret -.size ${pre}sha256_emit,.-${pre}sha256_emit - -.globl ${pre}sha256_bcopy -.hidden ${pre}sha256_bcopy -.type ${pre}sha256_bcopy,\@abi-omnipotent -.align 16 -${pre}sha256_bcopy: - sub $inp, $out -.Loop_bcopy: - movzb ($inp), %eax - lea 1($inp), $inp - mov %al, -1($out,$inp) - dec $len - jnz .Loop_bcopy - ret -.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy - -.globl ${pre}sha256_hcopy -.hidden ${pre}sha256_hcopy -.type ${pre}sha256_hcopy,\@abi-omnipotent -.align 16 -${pre}sha256_hcopy: - mov 0($inp), %r8 - mov 8($inp), %r9 - mov 16($inp), %r10 - mov 24($inp), %r11 - mov %r8, 0($out) - mov %r9, 8($out) - mov %r10, 16($out) - mov %r11, 24($out) - ret -.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy -___ -} - -sub sha256op38 { - my $instr = shift; - my %opcodelet = ( - "sha256rnds2" => 0xcb, - "sha256msg1" => 0xcc, - "sha256msg2" => 0xcd ); - - if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) { - my @opcode=(0x0f,0x38); - push @opcode,$opcodelet{$instr}; - push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M - return ".byte\t".join(',',@opcode); - } else { - return $instr."\t".@_[0]; - } -} - -foreach (split("\n",$code)) { - s/\`([^\`]*)\`/eval $1/geo; - - s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo; - - print $_,"\n"; -} -close STDOUT; diff --git a/crypto/blst_src/asm/x86_64-xlate.pl b/crypto/blst_src/asm/x86_64-xlate.pl deleted file mode 100755 index 62be619d9fc..00000000000 --- a/crypto/blst_src/asm/x86_64-xlate.pl +++ /dev/null @@ -1,1781 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Ascetic x86_64 AT&T to MASM/NASM assembler translator by @dot-asm. -# -# Why AT&T to MASM and not vice versa? Several reasons. Because AT&T -# format is way easier to parse. Because it's simpler to "gear" from -# Unix ABI to Windows one [see cross-reference "card" at the end of -# file]. Because Linux targets were available first... -# -# In addition the script also "distills" code suitable for GNU -# assembler, so that it can be compiled with more rigid assemblers, -# such as Solaris /usr/ccs/bin/as. -# -# This translator is not designed to convert *arbitrary* assembler -# code from AT&T format to MASM one. It's designed to convert just -# enough to provide for dual-ABI OpenSSL modules development... -# There *are* limitations and you might have to modify your assembler -# code or this script to achieve the desired result... -# -# Currently recognized limitations: -# -# - can't use multiple ops per line; -# -# Dual-ABI styling rules. -# -# 1. Adhere to Unix register and stack layout [see cross-reference -# ABI "card" at the end for explanation]. -# 2. Forget about "red zone," stick to more traditional blended -# stack frame allocation. If volatile storage is actually required -# that is. If not, just leave the stack as is. -# 3. 
Functions tagged with ".type name,@function" get crafted with -# unified Win64 prologue and epilogue automatically. If you want -# to take care of ABI differences yourself, tag functions as -# ".type name,@abi-omnipotent" instead. -# 4. To optimize the Win64 prologue you can specify number of input -# arguments as ".type name,@function,N." Keep in mind that if N is -# larger than 6, then you *have to* write "abi-omnipotent" code, -# because >6 cases can't be addressed with unified prologue. -# 5. Name local labels as .L*, do *not* use dynamic labels such as 1: -# (sorry about latter). -# 6. Don't use [or hand-code with .byte] "rep ret." "ret" mnemonic is -# required to identify the spots, where to inject Win64 epilogue! -# But on the pros, it's then prefixed with rep automatically:-) -# 7. Stick to explicit ip-relative addressing. If you have to use -# GOTPCREL addressing, stick to mov symbol@GOTPCREL(%rip),%r??. -# Both are recognized and translated to proper Win64 addressing -# modes. -# -# 8. In order to provide for structured exception handling unified -# Win64 prologue copies %rsp value to %rax. [Unless function is -# tagged with additional .type tag.] For further details see SEH -# paragraph at the end. -# 9. .init segment is allowed to contain calls to functions only. -# a. If function accepts more than 4 arguments *and* >4th argument -# is declared as non 64-bit value, do clear its upper part. - - -use strict; - -my $flavour = shift; -my $output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -open STDOUT,">$output" || die "can't open $output: $!" - if (defined($output)); - -my $gas=1; $gas=0 if ($output =~ /\.asm$/); -my $elf=1; $elf=0 if (!$gas); -my $dwarf=$elf; -my $win64=0; -my $prefix=""; -my $decor=".L"; - -my $masmref=8 + 50727*2**-32; # 8.00.50727 shipped with VS2005 -my $masm=0; -my $PTR=" PTR"; - -my $nasmref=2.03; -my $nasm=0; - -if ($flavour eq "mingw64") { $gas=1; $elf=0; $win64=1; - $prefix=`echo __USER_LABEL_PREFIX__ | \${CC:-false} -E -P -`; - $prefix =~ s|\R$||; # Better chomp - } -elsif ($flavour eq "macosx") { $gas=1; $elf=0; $prefix="_"; $decor="L\$"; } -elsif ($flavour eq "masm") { $gas=0; $elf=0; $masm=$masmref; $win64=1; $decor="\$L\$"; } -elsif ($flavour eq "nasm") { $gas=0; $elf=0; $nasm=$nasmref; $win64=1; $decor="\$L\$"; $PTR=""; } -elsif (!$gas) -{ if ($ENV{ASM} =~ m/nasm/ && `nasm -v` =~ m/version ([0-9]+)\.([0-9]+)/i) - { $nasm = $1 + $2*0.01; $PTR=""; } - elsif (`ml64 2>&1` =~ m/Version ([0-9]+)\.([0-9]+)(\.([0-9]+))?/) - { $masm = $1 + $2*2**-16 + $4*2**-32; } - die "no assembler found on %PATH%" if (!($nasm || $masm)); - $win64=1; - $elf=0; - $decor="\$L\$"; -} - -$dwarf=0 if($win64); - -my $current_segment; -my $current_function; -my %globals; - -{ package opcode; # pick up opcodes - sub re { - my ($class, $line) = @_; - my $self = {}; - my $ret; - - if ($$line =~ /^([a-z][a-z0-9]*)/i) { - bless $self,$class; - $self->{op} = $1; - $ret = $self; - $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; - - undef $self->{sz}; - if ($self->{op} =~ /^(movz)x?([bw]).*/) { # movz is pain... 
- $self->{op} = $1; - $self->{sz} = $2; - } elsif ($self->{op} =~ /cmov[n]?[lb]$/) { - # pass through - } elsif ($self->{op} =~ /call|jmp/) { - $self->{sz} = ""; - } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn - $self->{sz} = ""; - } elsif ($self->{op} =~ /^[vk]/) { # VEX or k* such as kmov - $self->{sz} = ""; - } elsif ($self->{op} =~ /mov[dq]/ && $$line =~ /%xmm/) { - $self->{sz} = ""; - } elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) { - $self->{op} = $1; - $self->{sz} = $2; - } - } - $ret; - } - sub size { - my ($self, $sz) = @_; - $self->{sz} = $sz if (defined($sz) && !defined($self->{sz})); - $self->{sz}; - } - sub out { - my $self = shift; - if ($gas) { - if ($self->{op} eq "movz") { # movz is pain... - sprintf "%s%s%s",$self->{op},$self->{sz},shift; - } elsif ($self->{op} =~ /^set/) { - "$self->{op}"; - } elsif ($self->{op} eq "ret") { - my $epilogue = ""; - if ($win64 && $current_function->{abi} eq "svr4" - && !$current_function->{unwind}) { - $epilogue = "movq 8(%rsp),%rdi\n\t" . - "movq 16(%rsp),%rsi\n\t"; - } - $epilogue . ".byte 0xf3,0xc3"; - } elsif ($self->{op} eq "call" && !$elf && $current_segment eq ".init") { - ".p2align\t3\n\t.quad"; - } else { - "$self->{op}$self->{sz}"; - } - } else { - $self->{op} =~ s/^movz/movzx/; - if ($self->{op} eq "ret") { - $self->{op} = ""; - if ($win64 && $current_function->{abi} eq "svr4" - && !$current_function->{unwind}) { - $self->{op} = "mov rdi,QWORD$PTR\[8+rsp\]\t;WIN64 epilogue\n\t". - "mov rsi,QWORD$PTR\[16+rsp\]\n\t"; - } - $self->{op} .= "DB\t0F3h,0C3h\t\t;repret"; - } elsif ($self->{op} =~ /^(pop|push)f/) { - $self->{op} .= $self->{sz}; - } elsif ($self->{op} eq "call" && $current_segment eq ".CRT\$XCU") { - $self->{op} = "\tDQ"; - } - $self->{op}; - } - } - sub mnemonic { - my ($self, $op) = @_; - $self->{op}=$op if (defined($op)); - $self->{op}; - } -} -{ package const; # pick up constants, which start with $ - sub re { - my ($class, $line) = @_; - my $self = {}; - my $ret; - - if ($$line =~ /^\$([^,]+)/) { - bless $self, $class; - $self->{value} = $1; - $ret = $self; - $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; - } - $ret; - } - sub out { - my $self = shift; - - $self->{value} =~ s/\b(0b[0-1]+)/oct($1)/eig; - if ($gas) { - # Solaris /usr/ccs/bin/as can't handle multiplications - # in $self->{value} - my $value = $self->{value}; - no warnings; # oct might complain about overflow, ignore here... 
- $value =~ s/(?{value} = $value; - } - sprintf "\$%s",$self->{value}; - } else { - my $value = $self->{value}; - $value =~ s/0x([0-9a-f]+)/0$1h/ig if ($masm); - sprintf "%s",$value; - } - } -} -{ package ea; # pick up effective addresses: expr(%reg,%reg,scale) - - my %szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR", - l=>"DWORD$PTR", d=>"DWORD$PTR", - q=>"QWORD$PTR", o=>"OWORD$PTR", - x=>"XMMWORD$PTR", y=>"YMMWORD$PTR", - z=>"ZMMWORD$PTR" ) if (!$gas); - - my %sifmap = ( ss=>"d", sd=>"q", # broadcast only - i32x2=>"q", f32x2=>"q", - i32x4=>"x", i64x2=>"x", i128=>"x", - f32x4=>"x", f64x2=>"x", f128=>"x", - i32x8=>"y", i64x4=>"y", - f32x8=>"y", f64x4=>"y" ) if (!$gas); - - sub re { - my ($class, $line, $opcode) = @_; - my $self = {}; - my $ret; - - # optional * ----vvv--- appears in indirect jmp/call - if ($$line =~ /^(\*?)([^\(,]*)\(([%\w,]+)\)((?:{[^}]+})*)/) { - bless $self, $class; - $self->{asterisk} = $1; - $self->{label} = $2; - ($self->{base},$self->{index},$self->{scale})=split(/,/,$3); - $self->{scale} = 1 if (!defined($self->{scale})); - $self->{opmask} = $4; - $ret = $self; - $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; - - if ($win64 && $self->{label} =~ s/\@GOTPCREL//) { - die if ($opcode->mnemonic() ne "mov"); - $opcode->mnemonic("lea"); - } - $self->{base} =~ s/^%//; - $self->{index} =~ s/^%// if (defined($self->{index})); - $self->{opcode} = $opcode; - } - $ret; - } - sub size {} - sub out { - my ($self, $sz) = @_; - - $self->{label} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; - $self->{label} =~ s/\.L/$decor/g; - - # Silently convert all EAs to 64-bit. This is required for - # elder GNU assembler and results in more compact code, - # *but* most importantly AES module depends on this feature! - $self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; - $self->{base} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; - - # Solaris /usr/ccs/bin/as can't handle multiplications - # in $self->{label}... - use integer; - $self->{label} =~ s/(?{label} =~ s/\b([0-9]+\s*[\*\/\%]\s*[0-9]+)\b/eval($1)/eg; - - # Some assemblers insist on signed presentation of 32-bit - # offsets, but sign extension is a tricky business in perl... 
- $self->{label} =~ s/\b([0-9]+)\b/unpack("l",pack("L",$1))/eg; - - # if base register is %rbp or %r13, see if it's possible to - # flip base and index registers [for better performance] - if (!$self->{label} && $self->{index} && $self->{scale}==1 && - $self->{base} =~ /(rbp|r13)/) { - $self->{base} = $self->{index}; $self->{index} = $1; - } - - if ($gas) { - $self->{label} =~ s/^___imp_/__imp__/ if ($flavour eq "mingw64"); - - if (defined($self->{index})) { - sprintf "%s%s(%s,%%%s,%d)%s", - $self->{asterisk},$self->{label}, - $self->{base}?"%$self->{base}":"", - $self->{index},$self->{scale}, - $self->{opmask}; - } else { - sprintf "%s%s(%%%s)%s", $self->{asterisk},$self->{label}, - $self->{base},$self->{opmask}; - } - } else { - $self->{label} =~ s/\./\$/g; - $self->{label} =~ s/(?{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/); - - my $mnemonic = $self->{opcode}->mnemonic(); - ($self->{asterisk}) && ($sz="q") || - ($mnemonic =~ /^v?mov([qd])$/) && ($sz=$1) || - ($mnemonic =~ /^v?pinsr([qdwb])$/) && ($sz=$1) || - ($mnemonic =~ /^vpbroadcast([qdwb])$/) && ($sz=$1) || - ($mnemonic =~ /^v(?:broadcast|extract|insert)([sif]\w+)$/) - && ($sz=$sifmap{$1}); - - $self->{opmask} =~ s/%(k[0-7])/$1/; - - if (defined($self->{index})) { - sprintf "%s[%s%s*%d%s]%s",$szmap{$sz}, - $self->{label}?"$self->{label}+":"", - $self->{index},$self->{scale}, - $self->{base}?"+$self->{base}":"", - $self->{opmask}; - } elsif ($self->{base} eq "rip") { - sprintf "%s[%s]",$szmap{$sz},$self->{label}; - } else { - sprintf "%s[%s%s]%s", $szmap{$sz}, - $self->{label}?"$self->{label}+":"", - $self->{base},$self->{opmask}; - } - } - } -} -{ package register; # pick up registers, which start with %. - sub re { - my ($class, $line, $opcode) = @_; - my $self = {}; - my $ret; - - # optional * ----vvv--- appears in indirect jmp/call - if ($$line =~ /^(\*?)%(\w+)((?:{[^}]+})*)/) { - bless $self,$class; - $self->{asterisk} = $1; - $self->{value} = $2; - $self->{opmask} = $3; - $opcode->size($self->size()); - $ret = $self; - $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; - } - $ret; - } - sub size { - my $self = shift; - my $ret; - - if ($self->{value} =~ /^r[\d]+b$/i) { $ret="b"; } - elsif ($self->{value} =~ /^r[\d]+w$/i) { $ret="w"; } - elsif ($self->{value} =~ /^r[\d]+d$/i) { $ret="l"; } - elsif ($self->{value} =~ /^r[\w]+$/i) { $ret="q"; } - elsif ($self->{value} =~ /^[a-d][hl]$/i){ $ret="b"; } - elsif ($self->{value} =~ /^[\w]{2}l$/i) { $ret="b"; } - elsif ($self->{value} =~ /^[\w]{2}$/i) { $ret="w"; } - elsif ($self->{value} =~ /^e[a-z]{2}$/i){ $ret="l"; } - - $ret; - } - sub out { - my $self = shift; - if ($gas) { sprintf "%s%%%s%s", $self->{asterisk}, - $self->{value}, - $self->{opmask}; } - else { $self->{opmask} =~ s/%(k[0-7])/$1/; - $self->{value}.$self->{opmask}; } - } -} -{ package label; # pick up labels, which end with : - sub re { - my ($class, $line) = @_; - my $self = {}; - my $ret; - - if ($$line =~ /(^[\.\w]+)\:/) { - bless $self,$class; - $self->{value} = $1; - $ret = $self; - $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; - - $self->{value} =~ s/^\.L/$decor/; - } - $ret; - } - sub out { - my $self = shift; - - if ($gas) { - my $func = ($globals{$self->{value}} or $self->{value}) . ":"; - if ($current_function->{name} eq $self->{value}) { - $func .= "\n.cfi_".cfi_directive::startproc() if ($dwarf); - $func .= "\n .byte 0xf3,0x0f,0x1e,0xfa\n"; # endbranch - if ($win64 && $current_function->{abi} eq "svr4") { - my $fp = $current_function->{unwind} ? 
"%r11" : "%rax"; - $func .= " movq %rdi,8(%rsp)\n"; - $func .= " movq %rsi,16(%rsp)\n"; - $func .= " movq %rsp,$fp\n"; - $func .= "${decor}SEH_begin_$current_function->{name}:\n"; - my $narg = $current_function->{narg}; - $narg=6 if (!defined($narg)); - $func .= " movq %rcx,%rdi\n" if ($narg>0); - $func .= " movq %rdx,%rsi\n" if ($narg>1); - $func .= " movq %r8,%rdx\n" if ($narg>2); - $func .= " movq %r9,%rcx\n" if ($narg>3); - $func .= " movq 40(%rsp),%r8\n" if ($narg>4); - $func .= " movq 48(%rsp),%r9\n" if ($narg>5); - } - } - $func; - } elsif ($self->{value} ne "$current_function->{name}") { - # Make all labels in masm global. - $self->{value} .= ":" if ($masm); - $self->{value} . ":"; - } elsif ($win64 && $current_function->{abi} eq "svr4") { - my $func = "$current_function->{name}" . - ($nasm ? ":" : "\tPROC $current_function->{scope}") . - "\n"; - my $fp = $current_function->{unwind} ? "r11" : "rax"; - $func .= " DB 243,15,30,250\n"; # endbranch - $func .= " mov QWORD$PTR\[8+rsp\],rdi\t;WIN64 prologue\n"; - $func .= " mov QWORD$PTR\[16+rsp\],rsi\n"; - $func .= " mov $fp,rsp\n"; - $func .= "${decor}SEH_begin_$current_function->{name}:"; - $func .= ":" if ($masm); - $func .= "\n"; - my $narg = $current_function->{narg}; - $narg=6 if (!defined($narg)); - $func .= " mov rdi,rcx\n" if ($narg>0); - $func .= " mov rsi,rdx\n" if ($narg>1); - $func .= " mov rdx,r8\n" if ($narg>2); - $func .= " mov rcx,r9\n" if ($narg>3); - $func .= " mov r8,QWORD$PTR\[40+rsp\]\n" if ($narg>4); - $func .= " mov r9,QWORD$PTR\[48+rsp\]\n" if ($narg>5); - $func .= "\n"; - } else { - "$current_function->{name}". - ($nasm ? ":" : "\tPROC $current_function->{scope}"). - "\n DB 243,15,30,250"; # endbranch - } - } -} -{ package expr; # pick up expressions - sub re { - my ($class, $line, $opcode) = @_; - my $self = {}; - my $ret; - - if ($$line =~ /(^[^,]+)/) { - bless $self,$class; - $self->{value} = $1; - $ret = $self; - $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; - - $self->{value} =~ s/\@PLT// if (!$elf); - $self->{value} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; - $self->{value} =~ s/\.L/$decor/g; - $self->{opcode} = $opcode; - } - $ret; - } - sub out { - my $self = shift; - $self->{value}; - } -} - -my @xdata_seg = (".section .xdata", ".align 8"); -my @pdata_seg = (".section .pdata", ".align 4"); - -{ package cfi_directive; - # CFI directives annotate instructions that are significant for - # stack unwinding procedure compliant with DWARF specification, - # see http://dwarfstd.org/. Besides naturally expected for this - # script platform-specific filtering function, this module adds - # three auxiliary synthetic directives not recognized by [GNU] - # assembler: - # - # - .cfi_push to annotate push instructions in prologue, which - # translates to .cfi_adjust_cfa_offset (if needed) and - # .cfi_offset; - # - .cfi_pop to annotate pop instructions in epilogue, which - # translates to .cfi_adjust_cfa_offset (if needed) and - # .cfi_restore; - # - [and most notably] .cfi_cfa_expression which encodes - # DW_CFA_def_cfa_expression and passes it to .cfi_escape as - # byte vector; - # - # CFA expressions were introduced in DWARF specification version - # 3 and describe how to deduce CFA, Canonical Frame Address. This - # becomes handy if your stack frame is variable and you can't - # spare register for [previous] frame pointer. Suggested directive - # syntax is made-up mix of DWARF operator suffixes [subset of] - # and references to registers with optional bias. 
Following example - # describes offloaded *original* stack pointer at specific offset - # from *current* stack pointer: - # - # .cfi_cfa_expression %rsp+40,deref,+8 - # - # Final +8 has everything to do with the fact that CFA is defined - # as reference to top of caller's stack, and on x86_64 call to - # subroutine pushes 8-byte return address. In other words original - # stack pointer upon entry to a subroutine is 8 bytes off from CFA. - # - # In addition the .cfi directives are re-purposed even for Win64 - # stack unwinding. Two more synthetic directives were added: - # - # - .cfi_end_prologue to denote point when all non-volatile - # registers are saved and stack or [chosen] frame pointer is - # stable; - # - .cfi_epilogue to denote point when all non-volatile registers - # are restored [and it even adds missing .cfi_restore-s]; - # - # Though it's not universal "miracle cure," it has its limitations. - # Most notably .cfi_cfa_expression won't start working... For more - # information see the end of this file. - - # Below constants are taken from "DWARF Expressions" section of the - # DWARF specification, section is numbered 7.7 in versions 3 and 4. - my %DW_OP_simple = ( # no-arg operators, mapped directly - deref => 0x06, dup => 0x12, - drop => 0x13, over => 0x14, - pick => 0x15, swap => 0x16, - rot => 0x17, xderef => 0x18, - - abs => 0x19, and => 0x1a, - div => 0x1b, minus => 0x1c, - mod => 0x1d, mul => 0x1e, - neg => 0x1f, not => 0x20, - or => 0x21, plus => 0x22, - shl => 0x24, shr => 0x25, - shra => 0x26, xor => 0x27, - ); - - my %DW_OP_complex = ( # used in specific subroutines - constu => 0x10, # uleb128 - consts => 0x11, # sleb128 - plus_uconst => 0x23, # uleb128 - lit0 => 0x30, # add 0-31 to opcode - reg0 => 0x50, # add 0-31 to opcode - breg0 => 0x70, # add 0-31 to opcole, sleb128 - regx => 0x90, # uleb28 - fbreg => 0x91, # sleb128 - bregx => 0x92, # uleb128, sleb128 - piece => 0x93, # uleb128 - ); - - # Following constants are defined in x86_64 ABI supplement, for - # example available at https://www.uclibc.org/docs/psABI-x86_64.pdf, - # see section 3.7 "Stack Unwind Algorithm". - my %DW_reg_idx = ( - "%rax"=>0, "%rdx"=>1, "%rcx"=>2, "%rbx"=>3, - "%rsi"=>4, "%rdi"=>5, "%rbp"=>6, "%rsp"=>7, - "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11, - "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15 - ); - - my ($cfa_reg, $cfa_off, $cfa_rsp, %saved_regs); - my @cfa_stack; - - # [us]leb128 format is variable-length integer representation base - # 2^128, with most significant bit of each byte being 0 denoting - # *last* most significant digit. See "Variable Length Data" in the - # DWARF specification, numbered 7.6 at least in versions 3 and 4. - sub sleb128 { - use integer; # get right shift extend sign - - my $val = shift; - my $sign = ($val < 0) ? -1 : 0; - my @ret = (); - - while(1) { - push @ret, $val&0x7f; - - # see if remaining bits are same and equal to most - # significant bit of the current digit, if so, it's - # last digit... - last if (($val>>6) == $sign); - - @ret[-1] |= 0x80; - $val >>= 7; - } - - return @ret; - } - sub uleb128 { - my $val = shift; - my @ret = (); - - while(1) { - push @ret, $val&0x7f; - - # see if it's last significant digit... 
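For orientation, the sleb128/uleb128 helpers being removed here implement standard DWARF LEB128: seven payload bits per output byte, the high bit set on every byte except the last, with the signed variant relying on an arithmetic right shift to carry the sign. An equivalent sketch in Go (illustrative only, not taken from this patch), with two worked encodings in main:

    // Illustrative Go equivalents of the sleb128/uleb128 helpers above.
    package main

    import "fmt"

    func sleb128(v int64) []byte {
        var out []byte
        for {
            b := byte(v & 0x7f)
            v >>= 7 // arithmetic shift keeps the sign
            // done once the remaining bits are just the sign extension of this digit
            if (v == 0 && b&0x40 == 0) || (v == -1 && b&0x40 != 0) {
                return append(out, b)
            }
            out = append(out, b|0x80)
        }
    }

    func uleb128(v uint64) []byte {
        var out []byte
        for {
            b := byte(v & 0x7f)
            v >>= 7
            if v == 0 {
                return append(out, b)
            }
            out = append(out, b|0x80)
        }
    }

    func main() {
        fmt.Printf("% x\n", sleb128(-8))  // 78, e.g. a CFA offset of -8
        fmt.Printf("% x\n", uleb128(300)) // ac 02
    }
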
- last if (($val >>= 7) == 0); - - @ret[-1] |= 0x80; - } - - return @ret; - } - sub const { - my $val = shift; - - if ($val >= 0 && $val < 32) { - return ($DW_OP_complex{lit0}+$val); - } - return ($DW_OP_complex{consts}, sleb128($val)); - } - sub reg { - my $val = shift; - - return if ($val !~ m/^(%r\w+)(?:([\+\-])((?:0x)?[0-9a-f]+))?/); - - my $reg = $DW_reg_idx{$1}; - my $off = eval ("0 $2 $3"); - - return (($DW_OP_complex{breg0} + $reg), sleb128($off)); - # Yes, we use DW_OP_bregX+0 to push register value and not - # DW_OP_regX, because latter would require even DW_OP_piece, - # which would be a waste under the circumstances. If you have - # to use DWP_OP_reg, use "regx:N"... - } - sub cfa_expression { - my $line = shift; - my @ret; - - foreach my $token (split(/,\s*/,$line)) { - if ($token =~ /^%r/) { - push @ret,reg($token); - } elsif ($token =~ /((?:0x)?[0-9a-f]+)\((%r\w+)\)/) { - push @ret,reg("$2+$1"); - } elsif ($token =~ /(\w+):(\-?(?:0x)?[0-9a-f]+)(U?)/i) { - my $i = 1*eval($2); - push @ret,$DW_OP_complex{$1}, ($3 ? uleb128($i) : sleb128($i)); - } elsif (my $i = 1*eval($token) or $token eq "0") { - if ($token =~ /^\+/) { - push @ret,$DW_OP_complex{plus_uconst},uleb128($i); - } else { - push @ret,const($i); - } - } else { - push @ret,$DW_OP_simple{$token}; - } - } - - # Finally we return DW_CFA_def_cfa_expression, 15, followed by - # length of the expression and of course the expression itself. - return (15,scalar(@ret),@ret); - } - - # Following constants are defined in "x64 exception handling" at - # https://docs.microsoft.com/ and match the register sequence in - # CONTEXT structure defined in winnt.h. - my %WIN64_reg_idx = ( - "%rax"=>0, "%rcx"=>1, "%rdx"=>2, "%rbx"=>3, - "%rsp"=>4, "%rbp"=>5, "%rsi"=>6, "%rdi"=>7, - "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11, - "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15 - ); - sub xdata { - our @dat = (); - our $len = 0; - - sub allocstack { - my $offset = shift; - - if ($offset) { - if ($offset <= 128) { - $offset = ($offset - 8) >> 3; - push @dat, [0,$offset<<4|2]; # UWOP_ALLOC_SMALL - } elsif ($offset < 0x80000) { - push @dat, [0,0x01,unpack("C2",pack("v",$offset>>3))]; - } else { - push @dat, [0,0x11,unpack("C4",pack("V",$offset))]; - } - $len += $#{@dat[-1]}+1; - } - } - - # allocate stack frame - if (my $offset = -8 - $cfa_rsp) { - # but see if frame pointer is among saved registers - if ($cfa_reg ne "%rsp" and my $fp_off = $saved_regs{$cfa_reg}) { - $fp_off = -8 - $fp_off; - allocstack($fp_off-8); - $offset -= $fp_off; - push @dat, [0,$WIN64_reg_idx{$cfa_reg}<<4]; # UWOP_PUSH_NONVOL - $len += $#{@dat[-1]}+1; - } - allocstack($offset); - } - # set up frame pointer - my $fp_info = 0; - if ($cfa_reg ne "%rsp") { - my $offset = $cfa_off - $cfa_rsp; - ($offset > 240 or $offset&0xf) and die "invalid FP offset $offset"; - $fp_info = ($offset&-16)|$WIN64_reg_idx{$cfa_reg}; - push @dat, [0,3]; # UWOP_SET_FPREG - $len += $#{@dat[-1]}+1; - } - # save registers - foreach my $key (sort { $saved_regs{$b} <=> $saved_regs{$a} } - keys(%saved_regs)) { - next if ($cfa_reg ne "%rsp" && $cfa_reg eq $key); - my $offset = $saved_regs{$key} - $cfa_rsp; - if ($key =~ /%xmm([0-9]+)/) { - if ($offset < 0x100000) { - push @dat, [0,($1<<4)|8,unpack("C2",pack("v",$offset>>4))]; - } else { - push @dat, [0,($1<<4)|9,unpack("C4",pack("V",$offset))]; - } - } else { - if ($offset < 0x80000) { - push @dat, [0,(($WIN64_reg_idx{$key})<<4)|4, - unpack("C2",pack("v",$offset>>3))]; - } else { - push @dat, [0,(($WIN64_reg_idx{$key})<<4)|5, - 
unpack("C4",pack("V",$offset))]; - } - } - $len += $#{@dat[-1]}+1; - } - - my @ret; - # generate 4-byte descriptor - push @ret, ".byte 1,0,".($len/2).",$fp_info"; - $len += 4; - # pad to 8*n - unshift @dat, [(0)x((-$len)&7)] if ($len&7); - # emit data - while(defined(my $row = pop @dat)) { - push @ret, ".byte ". join(",", - map { sprintf "0x%02x",$_ } @{$row}); - } - - return @ret; - } - sub startproc { - return if ($cfa_rsp == -8); - ($cfa_reg, $cfa_off, $cfa_rsp) = ("%rsp", -8, -8); - %saved_regs = (); - return "startproc"; - } - sub endproc { - return if ($cfa_rsp == 0); - ($cfa_reg, $cfa_off, $cfa_rsp) = ("%rsp", 0, 0); - %saved_regs = (); - return "endproc"; - } - sub re { - my ($class, $line) = @_; - my $self = {}; - my $ret; - - if ($$line =~ s/^\s*\.cfi_(\w+)\s*//) { - bless $self,$class; - $ret = $self; - undef $self->{value}; - my $dir = $1; - - SWITCH: for ($dir) { - # What is $cfa_rsp? Effectively it's difference between %rsp - # value and current CFA, Canonical Frame Address, which is - # why it starts with -8. Recall that CFA is top of caller's - # stack... - /startproc/ && do { $dir = startproc(); last; }; - /endproc/ && do { $dir = endproc(); - # .cfi_remember_state directives that are not - # matched with .cfi_restore_state are - # unnecessary. - die "unpaired .cfi_remember_state" if (@cfa_stack); - last; - }; - /def_cfa_register/ - && do { $cfa_off = $cfa_rsp if ($cfa_reg eq "%rsp"); - $cfa_reg = $$line; - last; - }; - /def_cfa_offset/ - && do { $cfa_off = -1*eval($$line); - $cfa_rsp = $cfa_off if ($cfa_reg eq "%rsp"); - last; - }; - /adjust_cfa_offset/ - && do { my $val = 1*eval($$line); - $cfa_off -= $val; - if ($cfa_reg eq "%rsp") { - $cfa_rsp -= $val; - } - last; - }; - /def_cfa/ && do { if ($$line =~ /(%r\w+)\s*,\s*(.+)/) { - $cfa_reg = $1; - $cfa_off = -1*eval($2); - $cfa_rsp = $cfa_off if ($cfa_reg eq "%rsp"); - } - last; - }; - /push/ && do { $dir = undef; - $cfa_rsp -= 8; - if ($cfa_reg eq "%rsp") { - $cfa_off = $cfa_rsp; - $self->{value} = ".cfi_adjust_cfa_offset\t8\n"; - } - $saved_regs{$$line} = $cfa_rsp; - $self->{value} .= ".cfi_offset\t$$line,$cfa_rsp"; - last; - }; - /pop/ && do { $dir = undef; - $cfa_rsp += 8; - if ($cfa_reg eq "%rsp") { - $cfa_off = $cfa_rsp; - $self->{value} = ".cfi_adjust_cfa_offset\t-8\n"; - } - $self->{value} .= ".cfi_restore\t$$line"; - delete $saved_regs{$$line}; - last; - }; - /cfa_expression/ - && do { $dir = undef; - $self->{value} = ".cfi_escape\t" . - join(",", map(sprintf("0x%02x", $_), - cfa_expression($$line))); - last; - }; - /remember_state/ - && do { push @cfa_stack, - [$cfa_reg,$cfa_off,$cfa_rsp,%saved_regs]; - last; - }; - /restore_state/ - && do { ($cfa_reg,$cfa_off,$cfa_rsp,%saved_regs) - = @{pop @cfa_stack}; - last; - }; - /offset/ && do { if ($$line =~ /(%\w+)\s*,\s*(.+)/) { - $saved_regs{$1} = 1*eval($2); - $dir = undef if ($1 =~ /%xmm/); - } - last; - }; - /restore/ && do { delete $saved_regs{$$line}; last; }; - /end_prologue/ - && do { $dir = undef; - $self->{win64} = ".endprolog"; - last; - }; - /epilogue/ && do { $dir = undef; - $self->{win64} = ".epilogue"; - $self->{value} = join("\n", - map { ".cfi_restore\t$_" } - sort keys(%saved_regs)); - %saved_regs = (); - last; - }; - } - - $self->{value} = ".cfi_$dir\t$$line" if ($dir); - - $$line = ""; - } - - return $ret; - } - sub out { - my $self = shift; - return $self->{value} if ($dwarf); - - if ($win64 and $current_function->{unwind} - and my $ret = $self->{win64}) { - my ($reg, $off) = ($cfa_reg =~ /%(?!rsp)/) ? 
($', $cfa_off) - : ("rsp", $cfa_rsp); - my $fname = $current_function->{name}; - - if ($ret eq ".endprolog") { - $saved_regs{"%rdi"} = 0; # relative to CFA, remember? - $saved_regs{"%rsi"} = 8; - - push @pdata_seg, - ".rva .LSEH_begin_${fname}", - ".rva .LSEH_body_${fname}", - ".rva .LSEH_info_${fname}_prologue",""; - push @xdata_seg, - ".LSEH_info_${fname}_prologue:", - ".byte 1,0,5,0x0b", # 5 unwind codes, %r11 is FP - ".byte 0,0x74,1,0", # %rdi at 8(%rsp) - ".byte 0,0x64,2,0", # %rsi at 16(%rsp) - ".byte 0,0x03", # set frame pointer - ".byte 0,0" # padding - ; - push @pdata_seg, - ".rva .LSEH_body_${fname}", - ".rva .LSEH_epilogue_${fname}", - ".rva .LSEH_info_${fname}_body",""; - push @xdata_seg,".LSEH_info_${fname}_body:", xdata(); - $ret = "${decor}SEH_body_${fname}:"; - $ret .= ":" if ($masm); $ret .= "\n"; - } elsif ($ret eq ".epilogue") { - %saved_regs = (); - $saved_regs{"%rdi"} = 0; # relative to CFA, remember? - $saved_regs{"%rsi"} = 8; - $cfa_rsp = $cfa_off; - - push @pdata_seg, - ".rva .LSEH_epilogue_${fname}", - ".rva .LSEH_end_${fname}", - ".rva .LSEH_info_${fname}_epilogue",""; - push @xdata_seg,".LSEH_info_${fname}_epilogue:", xdata(), ""; - $ret = "${decor}SEH_epilogue_${fname}:"; - $ret .= ":" if ($masm); $ret .= "\n"; - if ($gas) { - $ret .= " mov ".(0-$off)."(%$reg),%rdi\n"; - $ret .= " mov ".(8-$off)."(%$reg),%rsi\n"; - } else { - $ret .= " mov rdi,QWORD$PTR\[".(0-$off)."+$reg\]"; - $ret .= " ;WIN64 epilogue\n"; - $ret .= " mov rsi,QWORD$PTR\[".(8-$off)."+$reg\]\n"; - } - } - return $ret; - } - return; - } -} -{ package directive; # pick up directives, which start with . - sub re { - my ($class, $line) = @_; - my $self = {}; - my $ret; - my $dir; - - # chain-call to cfi_directive - $ret = cfi_directive->re($line) and return $ret; - - if ($$line =~ /^\s*(\.\w+)/) { - bless $self,$class; - $dir = $1; - $ret = $self; - undef $self->{value}; - $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; - - SWITCH: for ($dir) { - /\.global|\.globl|\.extern/ - && do { $globals{$$line} = $prefix . $$line; - $$line = $globals{$$line} if ($prefix); - last; - }; - /\.type/ && do { my ($sym,$type,$narg,$unwind) = split(',',$$line); - if ($type eq "\@function") { - undef $current_function; - $current_function->{name} = $sym; - $current_function->{abi} = "svr4"; - $current_function->{narg} = $narg; - $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; - $current_function->{unwind} = $unwind; - } elsif ($type eq "\@abi-omnipotent") { - undef $current_function; - $current_function->{name} = $sym; - $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; - } - $$line =~ s/\@abi\-omnipotent/\@function/; - $$line =~ s/\@function.*/\@function/; - last; - }; - /\.asciz/ && do { if ($$line =~ /^"(.*)"$/) { - $dir = ".byte"; - $$line = join(",",unpack("C*",$1),0); - } - last; - }; - /\.rva|\.long|\.quad/ - && do { $$line =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; - $$line =~ s/\.L/$decor/g; - last; - }; - } - - if ($gas) { - $self->{value} = $dir . "\t" . $$line; - - if ($dir =~ /\.extern/) { - $self->{value} = ""; # swallow extern - } elsif (!$elf && $dir =~ /\.type/) { - $self->{value} = ""; - $self->{value} = ".def\t" . ($globals{$1} or $1) . ";\t" . - (defined($globals{$1})?".scl 2;":".scl 3;") . 
- "\t.type 32;\t.endef" - if ($win64 && $$line =~ /([^,]+),\@function/); - } elsif ($dir =~ /\.size/) { - $self->{value} = "" if (!$elf); - if ($dwarf and my $endproc = cfi_directive::endproc()) { - $self->{value} = ".cfi_$endproc\n$self->{value}"; - } elsif (!$elf && defined($current_function)) { - $self->{value} .= "${decor}SEH_end_$current_function->{name}:" - if ($win64 && $current_function->{abi} eq "svr4"); - undef $current_function; - } - } elsif (!$elf && $dir =~ /\.align/) { - $self->{value} = ".p2align\t" . (log($$line)/log(2)); - } elsif ($dir eq ".section") { - $current_segment=$$line; - if (!$elf && $current_segment eq ".init") { - if ($flavour eq "macosx") { $self->{value} = ".mod_init_func"; } - elsif ($flavour eq "mingw64") { $self->{value} = ".section\t.ctors"; } - } - } elsif ($dir =~ /\.(text|data)/) { - $current_segment=".$1"; - } elsif ($dir =~ /\.hidden/) { - if ($flavour eq "macosx") { $self->{value} = ".private_extern\t$prefix$$line"; } - elsif ($flavour eq "mingw64") { $self->{value} = ""; } - } elsif ($dir =~ /\.comm/) { - $self->{value} = "$dir\t$prefix$$line"; - $self->{value} =~ s|,([0-9]+),([0-9]+)$|",$1,".log($2)/log(2)|e if ($flavour eq "macosx"); - } - $$line = ""; - return $self; - } - - # non-gas case or nasm/masm - SWITCH: for ($dir) { - /\.text/ && do { my $v=undef; - if ($nasm) { - $v="section .text code align=64\n"; - } else { - $v="$current_segment\tENDS\n" if ($current_segment); - $current_segment = ".text\$"; - $v.="$current_segment\tSEGMENT "; - $v.=$masm>=$masmref ? "ALIGN(256)" : "PAGE"; - $v.=" 'CODE'"; - } - $self->{value} = $v; - last; - }; - /\.data/ && do { my $v=undef; - if ($nasm) { - $v="section .data data align=8\n"; - } else { - $v="$current_segment\tENDS\n" if ($current_segment); - $current_segment = "_DATA"; - $v.="$current_segment\tSEGMENT"; - } - $self->{value} = $v; - last; - }; - /\.section/ && do { my $v=undef; - $$line =~ s/([^,]*).*/$1/; - $$line = ".CRT\$XCU" if ($$line eq ".init"); - if ($nasm) { - $v="section $$line"; - if ($$line=~/\.([px])data/) { - $v.=" rdata align="; - $v.=$1 eq "p"? 4 : 8; - } elsif ($$line=~/\.CRT\$/i) { - $v.=" rdata align=8"; - } - } else { - $v="$current_segment\tENDS\n" if ($current_segment); - $v.="$$line\tSEGMENT"; - if ($$line=~/\.([px])data/) { - $v.=" READONLY"; - $v.=" ALIGN(".($1 eq "p" ? 4 : 8).")" if ($masm>=$masmref); - } elsif ($$line=~/\.CRT\$/i) { - $v.=" READONLY "; - $v.=$masm>=$masmref ? "ALIGN(8)" : "DWORD"; - } - } - $current_segment = $$line; - $self->{value} = $v; - last; - }; - /\.extern/ && do { $self->{value} = "EXTERN\t".$$line; - $self->{value} .= ":NEAR" if ($masm); - last; - }; - /\.globl|.global/ - && do { $self->{value} = $masm?"PUBLIC":"global"; - $self->{value} .= "\t".$$line; - last; - }; - /\.size/ && do { if (defined($current_function)) { - undef $self->{value}; - if ($current_function->{abi} eq "svr4") { - $self->{value}="${decor}SEH_end_$current_function->{name}:"; - $self->{value}.=":\n" if($masm); - } - $self->{value}.="$current_function->{name}\tENDP" if($masm && $current_function->{name}); - undef $current_function; - } - last; - }; - /\.align/ && do { my $max = ($masm && $masm>=$masmref) ? 
256 : 4096; - $self->{value} = "ALIGN\t".($$line>$max?$max:$$line); - last; - }; - /\.(value|long|rva|quad)/ - && do { my $sz = substr($1,0,1); - my @arr = split(/,\s*/,$$line); - my $last = pop(@arr); - my $conv = sub { my $var=shift; - $var=~s/^(0b[0-1]+)/oct($1)/eig; - $var=~s/^0x([0-9a-f]+)/0$1h/ig if ($masm); - if ($sz eq "D" && ($current_segment=~/.[px]data/ || $dir eq ".rva")) - { $var=~s/^([_a-z\$\@][_a-z0-9\$\@]*)/$nasm?"$1 wrt ..imagebase":"imagerel $1"/egi; } - $var; - }; - - $sz =~ tr/bvlrq/BWDDQ/; - $self->{value} = "\tD$sz\t"; - for (@arr) { $self->{value} .= &$conv($_).","; } - $self->{value} .= &$conv($last); - last; - }; - /\.byte/ && do { my @str=split(/,\s*/,$$line); - map(s/(0b[0-1]+)/oct($1)/eig,@str); - map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm); - while ($#str>15) { - $self->{value}.="DB\t" - .join(",",@str[0..15])."\n"; - foreach (0..15) { shift @str; } - } - $self->{value}.="DB\t" - .join(",",@str) if (@str); - last; - }; - /\.comm/ && do { my @str=split(/,\s*/,$$line); - my $v=undef; - if ($nasm) { - $v.="common $prefix@str[0] @str[1]"; - } else { - $v="$current_segment\tENDS\n" if ($current_segment); - $current_segment = "_DATA"; - $v.="$current_segment\tSEGMENT\n"; - $v.="COMM @str[0]:DWORD:".@str[1]/4; - } - $self->{value} = $v; - last; - }; - } - $$line = ""; - } - - $ret; - } - sub out { - my $self = shift; - $self->{value}; - } -} - -# Upon initial x86_64 introduction SSE>2 extensions were not introduced -# yet. In order not to be bothered by tracing exact assembler versions, -# but at the same time to provide a bare security minimum of AES-NI, we -# hard-code some instructions. Extensions past AES-NI on the other hand -# are traced by examining assembler version in individual perlasm -# modules... - -my %regrm = ( "%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3, - "%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7 ); - -sub rex { - my $opcode=shift; - my ($dst,$src,$rex)=@_; - - $rex|=0x04 if($dst>=8); - $rex|=0x01 if($src>=8); - push @$opcode,($rex|0x40) if ($rex); -} - -my $movq = sub { # elderly gas can't handle inter-register movq - my $arg = shift; - my @opcode=(0x66); - if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) { - my ($src,$dst)=($1,$2); - if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } - rex(\@opcode,$src,$dst,0x8); - push @opcode,0x0f,0x7e; - push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M - @opcode; - } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) { - my ($src,$dst)=($2,$1); - if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } - rex(\@opcode,$src,$dst,0x8); - push @opcode,0x0f,0x6e; - push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M - @opcode; - } else { - (); - } -}; - -my $pextrd = sub { - if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) { - my @opcode=(0x66); - my $imm=$1; - my $src=$2; - my $dst=$3; - if ($dst =~ /%r([0-9]+)d/) { $dst = $1; } - elsif ($dst =~ /%e/) { $dst = $regrm{$dst}; } - rex(\@opcode,$src,$dst); - push @opcode,0x0f,0x3a,0x16; - push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M - push @opcode,$imm; - @opcode; - } else { - (); - } -}; - -my $pinsrd = sub { - if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) { - my @opcode=(0x66); - my $imm=$1; - my $src=$2; - my $dst=$3; - if ($src =~ /%r([0-9]+)/) { $src = $1; } - elsif ($src =~ /%e/) { $src = $regrm{$src}; } - rex(\@opcode,$dst,$src); - push @opcode,0x0f,0x3a,0x22; - push @opcode,0xc0|(($dst&7)<<3)|($src&7); # ModR/M - push @opcode,$imm; - @opcode; - } else { - (); - } -}; - -my $pshufb = sub { - if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { - my @opcode=(0x66); - 
rex(\@opcode,$2,$1); - push @opcode,0x0f,0x38,0x00; - push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M - @opcode; - } else { - (); - } -}; - -my $palignr = sub { - if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { - my @opcode=(0x66); - rex(\@opcode,$3,$2); - push @opcode,0x0f,0x3a,0x0f; - push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M - push @opcode,$1; - @opcode; - } else { - (); - } -}; - -my $pclmulqdq = sub { - if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { - my @opcode=(0x66); - rex(\@opcode,$3,$2); - push @opcode,0x0f,0x3a,0x44; - push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M - my $c=$1; - push @opcode,$c=~/^0/?oct($c):$c; - @opcode; - } else { - (); - } -}; - -my $rdrand = sub { - if (shift =~ /%[er](\w+)/) { - my @opcode=(); - my $dst=$1; - if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } - rex(\@opcode,0,$dst,8); - push @opcode,0x0f,0xc7,0xf0|($dst&7); - @opcode; - } else { - (); - } -}; - -my $rdseed = sub { - if (shift =~ /%[er](\w+)/) { - my @opcode=(); - my $dst=$1; - if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } - rex(\@opcode,0,$dst,8); - push @opcode,0x0f,0xc7,0xf8|($dst&7); - @opcode; - } else { - (); - } -}; - -# Not all AVX-capable assemblers recognize AMD XOP extension. Since we -# are using only two instructions hand-code them in order to be excused -# from chasing assembler versions... - -sub rxb { - my $opcode=shift; - my ($dst,$src1,$src2,$rxb)=@_; - - $rxb|=0x7<<5; - $rxb&=~(0x04<<5) if($dst>=8); - $rxb&=~(0x01<<5) if($src1>=8); - $rxb&=~(0x02<<5) if($src2>=8); - push @$opcode,$rxb; -} - -my $vprotd = sub { - if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { - my @opcode=(0x8f); - rxb(\@opcode,$3,$2,-1,0x08); - push @opcode,0x78,0xc2; - push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M - my $c=$1; - push @opcode,$c=~/^0/?oct($c):$c; - @opcode; - } else { - (); - } -}; - -my $vprotq = sub { - if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { - my @opcode=(0x8f); - rxb(\@opcode,$3,$2,-1,0x08); - push @opcode,0x78,0xc3; - push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M - my $c=$1; - push @opcode,$c=~/^0/?oct($c):$c; - @opcode; - } else { - (); - } -}; - -# Intel Control-flow Enforcement Technology extension. All functions and -# indirect branch targets will have to start with this instruction... -# However, it should not be used in functions' prologues explicitly, as -# it's added automatically [and in the right spot]. Which leaves only -# non-function indirect branch targets, such as in a case-like dispatch -# table, as application area. - -my $endbr64 = sub { - (0xf3,0x0f,0x1e,0xfa); -}; - -######################################################################## - -if ($nasm) { - print <<___; -default rel -%define XMMWORD -%define YMMWORD -%define ZMMWORD -___ -} elsif ($masm) { - print <<___; -OPTION DOTNAME -___ -} - -sub process { - my $line = shift; - - $line =~ s|\R$||; # Better chomp - - $line =~ s|[#!].*$||; # get rid of asm-style comments... - $line =~ s|/\*.*\*/||; # ... and C-style comments... - $line =~ s|^\s+||; # ... and skip white spaces in beginning - $line =~ s|\s+$||; # ... 
and at the end - - if (my $label=label->re(\$line)) { print $label->out(); } - - if (my $directive=directive->re(\$line)) { - printf "%s",$directive->out(); - } elsif (my $opcode=opcode->re(\$line)) { - my $asm = eval("\$".$opcode->mnemonic()); - - if ((ref($asm) eq 'CODE') && scalar(my @bytes=&$asm($line))) { - print $gas?".byte\t":"DB\t",join(',',@bytes),"\n"; - next; - } - - my @args; - ARGUMENT: while (1) { - my $arg; - - ($arg=register->re(\$line, $opcode))|| - ($arg=const->re(\$line)) || - ($arg=ea->re(\$line, $opcode)) || - ($arg=expr->re(\$line, $opcode)) || - last ARGUMENT; - - push @args,$arg; - - last ARGUMENT if ($line !~ /^,/); - - $line =~ s/^,\s*//; - } # ARGUMENT: - - if ($#args>=0) { - my $insn; - my $sz=$opcode->size(); - - if ($gas) { - $insn = $opcode->out($#args>=1?$args[$#args]->size():$sz); - @args = map($_->out($sz),@args); - printf "\t%s\t%s",$insn,join(",",@args); - } else { - $insn = $opcode->out(); - foreach (@args) { - my $arg = $_->out(); - # $insn.=$sz compensates for movq, pinsrw, ... - if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; } - if ($arg =~ /^ymm[0-9]+$/) { $insn.=$sz; $sz="y" if(!$sz); last; } - if ($arg =~ /^zmm[0-9]+$/) { $insn.=$sz; $sz="z" if(!$sz); last; } - if ($arg =~ /^mm[0-9]+$/) { $insn.=$sz; $sz="q" if(!$sz); last; } - } - @args = reverse(@args); - undef $sz if ($nasm && $opcode->mnemonic() eq "lea"); - printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args)); - } - } else { - printf "\t%s",$opcode->out(); - } - } - - print $line,"\n"; -} - -while(<>) { process($_); } - -map { process($_) } @pdata_seg if ($win64); -map { process($_) } @xdata_seg if ($win64); - -# platform-specific epilogue -if ($masm) { - print "\n$current_segment\tENDS\n" if ($current_segment); - print "END\n"; -} elsif ($elf) { - # -fcf-protection segment, snatched from compiler -S output - my $align = ($flavour =~ /elf32/) ? 4 : 8; - print <<___; - -.section .note.GNU-stack,"",\@progbits -.section .note.gnu.property,"a",\@note - .long 4,2f-1f,5 - .byte 0x47,0x4E,0x55,0 -1: .long 0xc0000002,4,3 -.align $align -2: -___ -} - -close STDOUT; - - ################################################# -# Cross-reference x86_64 ABI "card" -# -# Unix Win64 -# %rax * * -# %rbx - - -# %rcx #4 #1 -# %rdx #3 #2 -# %rsi #2 - -# %rdi #1 - -# %rbp - - -# %rsp - - -# %r8 #5 #3 -# %r9 #6 #4 -# %r10 * * -# %r11 * * -# %r12 - - -# %r13 - - -# %r14 - - -# %r15 - - -# -# (*) volatile register -# (-) preserved by callee -# (#) Nth argument, volatile -# -# In Unix terms top of stack is argument transfer area for arguments -# which could not be accommodated in registers. Or in other words 7th -# [integer] argument resides at 8(%rsp) upon function entry point. -# 128 bytes above %rsp constitute a "red zone" which is not touched -# by signal handlers and can be used as temporal storage without -# allocating a frame. -# -# In Win64 terms N*8 bytes on top of stack is argument transfer area, -# which belongs to/can be overwritten by callee. N is the number of -# arguments passed to callee, *but* not less than 4! This means that -# upon function entry point 5th argument resides at 40(%rsp), as well -# as that 32 bytes from 8(%rsp) can always be used as temporal -# storage [without allocating a frame]. One can actually argue that -# one can assume a "red zone" above stack pointer under Win64 as well. -# Point is that at apparently no occasion Windows kernel would alter -# the area above user stack pointer in true asynchronous manner... 
-# -# All the above means that if assembler programmer adheres to Unix -# register and stack layout, but disregards the "red zone" existence, -# it's possible to use following prologue and epilogue to "gear" from -# Unix to Win64 ABI in leaf functions with not more than 6 arguments. -# -# omnipotent_function: -# ifdef WIN64 -# movq %rdi,8(%rsp) -# movq %rsi,16(%rsp) -# movq %rcx,%rdi ; if 1st argument is actually present -# movq %rdx,%rsi ; if 2nd argument is actually ... -# movq %r8,%rdx ; if 3rd argument is ... -# movq %r9,%rcx ; if 4th argument ... -# movq 40(%rsp),%r8 ; if 5th ... -# movq 48(%rsp),%r9 ; if 6th ... -# endif -# ... -# ifdef WIN64 -# movq 8(%rsp),%rdi -# movq 16(%rsp),%rsi -# endif -# ret -# - ################################################# -# Win64 SEH, Structured Exception Handling. -# -# Unlike on Unix systems(*) lack of Win64 stack unwinding information -# has undesired side-effect at run-time: if an exception is raised in -# assembler subroutine such as those in question (basically we're -# referring to segmentation violations caused by malformed input -# parameters), the application is briskly terminated without invoking -# any exception handlers, most notably without generating memory dump -# or any user notification whatsoever. This poses a problem. It's -# possible to address it by registering custom language-specific -# handler that would restore processor context to the state at -# subroutine entry point and return "exception is not handled, keep -# unwinding" code. Writing such handler can be a challenge... But it's -# doable, though requires certain coding convention. Consider following -# snippet: -# -# .type function,@function -# function: -# movq %rsp,%rax # copy rsp to volatile register -# pushq %r15 # save non-volatile registers -# pushq %rbx -# pushq %rbp -# movq %rsp,%r11 -# subq %rdi,%r11 # prepare [variable] stack frame -# andq $-64,%r11 -# movq %rax,0(%r11) # check for exceptions -# movq %r11,%rsp # allocate [variable] stack frame -# movq %rax,0(%rsp) # save original rsp value -# magic_point: -# ... -# movq 0(%rsp),%rcx # pull original rsp value -# movq -24(%rcx),%rbp # restore non-volatile registers -# movq -16(%rcx),%rbx -# movq -8(%rcx),%r15 -# movq %rcx,%rsp # restore original rsp -# magic_epilogue: -# ret -# .size function,.-function -# -# The key is that up to magic_point copy of original rsp value remains -# in chosen volatile register and no non-volatile register, except for -# rsp, is modified. While past magic_point rsp remains constant till -# the very end of the function. In this case custom language-specific -# exception handler would look like this: -# -# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, -# CONTEXT *context,DISPATCHER_CONTEXT *disp) -# { ULONG64 *rsp = (ULONG64 *)context->Rax; -# ULONG64 rip = context->Rip; -# -# if (rip >= magic_point) -# { rsp = (ULONG64 *)context->Rsp; -# if (rip < magic_epilogue) -# { rsp = (ULONG64 *)rsp[0]; -# context->Rbp = rsp[-3]; -# context->Rbx = rsp[-2]; -# context->R15 = rsp[-1]; -# } -# } -# context->Rsp = (ULONG64)rsp; -# context->Rdi = rsp[1]; -# context->Rsi = rsp[2]; -# -# memcpy (disp->ContextRecord,context,sizeof(CONTEXT)); -# RtlVirtualUnwind(UNW_FLAG_NHANDLER,disp->ImageBase, -# dips->ControlPc,disp->FunctionEntry,disp->ContextRecord, -# &disp->HandlerData,&disp->EstablisherFrame,NULL); -# return ExceptionContinueSearch; -# } -# -# It's appropriate to implement this handler in assembler, directly in -# function's module. 
In order to do that one has to know members' -# offsets in CONTEXT and DISPATCHER_CONTEXT structures and some constant -# values. Here they are: -# -# CONTEXT.Rax 120 -# CONTEXT.Rcx 128 -# CONTEXT.Rdx 136 -# CONTEXT.Rbx 144 -# CONTEXT.Rsp 152 -# CONTEXT.Rbp 160 -# CONTEXT.Rsi 168 -# CONTEXT.Rdi 176 -# CONTEXT.R8 184 -# CONTEXT.R9 192 -# CONTEXT.R10 200 -# CONTEXT.R11 208 -# CONTEXT.R12 216 -# CONTEXT.R13 224 -# CONTEXT.R14 232 -# CONTEXT.R15 240 -# CONTEXT.Rip 248 -# CONTEXT.Xmm6 512 -# sizeof(CONTEXT) 1232 -# DISPATCHER_CONTEXT.ControlPc 0 -# DISPATCHER_CONTEXT.ImageBase 8 -# DISPATCHER_CONTEXT.FunctionEntry 16 -# DISPATCHER_CONTEXT.EstablisherFrame 24 -# DISPATCHER_CONTEXT.TargetIp 32 -# DISPATCHER_CONTEXT.ContextRecord 40 -# DISPATCHER_CONTEXT.LanguageHandler 48 -# DISPATCHER_CONTEXT.HandlerData 56 -# UNW_FLAG_NHANDLER 0 -# ExceptionContinueSearch 1 -# -# In order to tie the handler to the function one has to compose -# couple of structures: one for .xdata segment and one for .pdata. -# -# UNWIND_INFO structure for .xdata segment would be -# -# function_unwind_info: -# .byte 9,0,0,0 -# .rva handler -# -# This structure designates exception handler for a function with -# zero-length prologue, no stack frame or frame register. -# -# To facilitate composing of .pdata structures, auto-generated "gear" -# prologue copies rsp value to rax and denotes next instruction with -# .LSEH_begin_{function_name} label. This essentially defines the SEH -# styling rule mentioned in the beginning. Position of this label is -# chosen in such manner that possible exceptions raised in the "gear" -# prologue would be accounted to caller and unwound from latter's frame. -# End of function is marked with respective .LSEH_end_{function_name} -# label. To summarize, .pdata segment would contain -# -# .rva .LSEH_begin_function -# .rva .LSEH_end_function -# .rva function_unwind_info -# -# Reference to function_unwind_info from .xdata segment is the anchor. -# In case you wonder why references are 32-bit .rvas and not 64-bit -# .quads. References put into these two segments are required to be -# *relative* to the base address of the current binary module, a.k.a. -# image base. No Win64 module, be it .exe or .dll, can be larger than -# 2GB and thus such relative references can be and are accommodated in -# 32 bits. -# -# Having reviewed the example function code, one can argue that "movq -# %rsp,%rax" above is redundant. It is not! Keep in mind that on Unix -# rax would contain an undefined value. If this "offends" you, use -# another register and refrain from modifying rax till magic_point is -# reached, i.e. as if it was a non-volatile register. If more registers -# are required prior [variable] frame setup is completed, note that -# nobody says that you can have only one "magic point." You can -# "liberate" non-volatile registers by denoting last stack off-load -# instruction and reflecting it in finer grade unwind logic in handler. -# After all, isn't it why it's called *language-specific* handler... -# -# SE handlers are also involved in unwinding stack when executable is -# profiled or debugged. Profiling implies additional limitations that -# are too subtle to discuss here. For now it's sufficient to say that -# in order to simplify handlers one should either a) offload original -# %rsp to stack (like discussed above); or b) if you have a register to -# spare for frame pointer, choose volatile one. -# -# (*) Note that we're talking about run-time, not debug-time. 
Lack of -# unwind information makes debugging hard on both Windows and -# Unix. "Unlike" refers to the fact that on Unix signal handler -# will always be invoked, core dumped and appropriate exit code -# returned to parent (for user notification). -# -######################################################################## -# As of May 2020 an alternative approach that works with both exceptions -# and debugging/profiling was implemented by re-purposing DWARF .cfi -# annotations even for Win64 unwind tables' generation. Unfortunately, -# but not really unexpectedly, it imposes additional limitations on -# coding style. Probably most significant limitation is that frame -# pointer has to be at 16*n distance from stack pointer at the exit -# from prologue. But first things first. There are two additional -# synthetic .cfi directives, .cfi_end_prologue and .cfi_epilogue, -# that need to be added to all functions marked with additional .type -# tag (see example below). There are "do's and don'ts" for prologue -# and epilogue. It shouldn't come as surprise that in prologue one may -# not modify non-volatile registers, but one may not modify %r11 either. -# This is because it's used as temporary frame pointer(*). There is one -# exception to this rule, and it's setting up frame pointer that is -# non-volatile or %r11. But it must be last instruction in the prologue. -# Constraints for epilogue, or rather on its boundary, depend on whether -# the frame is fixed- or variable-length. In fixed-frame subroutine -# stack pointer has to be restored in the last instruction prior the -# .cfi_epilogue directive. If it's variable-frame subroutine, and a -# non-volatile register was used as frame pointer, then last instruction -# prior the directive has to restore its original value. This means that -# final stack pointer adjustment would have to be pushed past the -# directive. Normally this would render the epilogue non-unwindable, so -# special care has to be taken. To resolve the dilemma, copy frame -# pointer to a volatile register in advance. To give an example: -# -# .type rbp_as_frame_pointer,\@function,3,"unwind" # mind extra tag! -# rbp_as_frame_pointer: -# .cfi_startproc -# push %rbp -# .cfi_push %rbp -# push %rbx -# .cfi_push %rbx -# mov %rsp,%rbp # last instruction in prologue -# .cfi_def_cfa_register %rbp # %rsp-%rbp has to be 16*n, e.g. 16*0 -# .cfi_end_prologue -# sub \$40,%rsp -# and \$-64,%rsp -# ... -# mov %rbp,%r11 -# .cfi_def_cfa_register %r11 # copy frame pointer to volatile %r11 -# mov 0(%rbp),%rbx -# mov 8(%rbp),%rbp # last instruction prior epilogue -# .cfi_epilogue # may not change %r11 in epilogue -# lea 16(%r11),%rsp -# ret -# .cfi_endproc -# .size rbp_as_frame_pointer,.-rbp_as_frame_pointer -# -# To give an example of fixed-frame subroutine for reference: -# -# .type fixed_frame,\@function,3,"unwind" # mind extra tag! -# fixed_frame: -# .cfi_startproc -# push %rbp -# .cfi_push %rbp -# push %rbx -# .cfi_push %rbx -# sub \$40,%rsp -# .cfi_adjust_cfa_offset 40 -# .cfi_end_prologue -# ... -# mov 40(%rsp),%rbx -# mov 48(%rsp),%rbp -# lea 56(%rsp),%rsp -# .cfi_adjust_cfa_offset -56 -# .cfi_epilogue -# ret -# .cfi_endproc -# .size fixed_frame,.-fixed_frame -# -# As for epilogue itself, one can only work on non-volatile registers. -# "Non-volatile" in "Windows" sense, i.e. minus %rdi and %rsi. -# -# On a final note, mixing old-style and modernized subroutines in the -# same file takes some trickery. Ones of the new kind have to appear -# after old-style ones. 
This has everything to do with the fact that -# entries in the .pdata segment have to appear in strictly same order -# as corresponding subroutines, and auto-generated RUNTIME_FUNCTION -# structures get mechanically appended to whatever existing .pdata. -# -# (*) Just in case, why %r11 and not %rax. This has everything to do -# with the way UNWIND_INFO is, one just can't designate %rax as -# frame pointer. diff --git a/crypto/blst_src/client_min_pk.c b/crypto/blst_src/client_min_pk.c index 0fcf563f502..df11e3dae73 100644 --- a/crypto/blst_src/client_min_pk.c +++ b/crypto/blst_src/client_min_pk.c @@ -4,7 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "keygen.c" +/*#include "keygen.c" #include "e2.c" #include "hash_to_field.c" #include "map_to_g2.c" @@ -14,4 +14,4 @@ #include "recip.c" #include "consts.c" #include "vect.c" -#include "exports.c" +#include "exports.c"*/ diff --git a/crypto/blst_src/client_min_sig.c b/crypto/blst_src/client_min_sig.c index 8e4663daede..fffbd5ad52d 100644 --- a/crypto/blst_src/client_min_sig.c +++ b/crypto/blst_src/client_min_sig.c @@ -4,7 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "keygen.c" +/*#include "keygen.c" #include "e1.c" #include "hash_to_field.c" #include "map_to_g1.c" @@ -14,4 +14,4 @@ #include "recip.c" #include "consts.c" #include "vect.c" -#include "exports.c" +#include "exports.c"*/ diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index 451c1b8a180..175fe5acb0a 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -3,7 +3,7 @@ package crypto -// #cgo CFLAGS: -g -Wall -std=c99 +// #cgo CFLAGS: // #include "dkg_include.h" import "C" diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index 335ce6fc86d..ecb26f7d6e3 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -3,7 +3,7 @@ package crypto -// #cgo CFLAGS: -g -Wall -std=c99 +// #cgo CFLAGS: // #include "dkg_include.h" import "C" diff --git a/crypto/dkg_jointfeldman.go b/crypto/dkg_jointfeldman.go index 7b63f88e810..d79379f7d83 100644 --- a/crypto/dkg_jointfeldman.go +++ b/crypto/dkg_jointfeldman.go @@ -3,7 +3,7 @@ package crypto -// #cgo CFLAGS: -g -Wall -std=c99 +// #cgo CFLAGS: // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #include "dkg_include.h" import "C" diff --git a/crypto/spock.go b/crypto/spock.go index 2487f39ce1b..a4087316319 100644 --- a/crypto/spock.go +++ b/crypto/spock.go @@ -6,7 +6,7 @@ package crypto // SPoCK design based on the BLS signature scheme. // BLS is using BLS12-381 curve and the same settings in bls.go. 
-// #cgo CFLAGS: -g -Wall -std=c99 +// #cgo CFLAGS: // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #include "bls_include.h" import "C" From 3c0247cfc9c72c0c9e75231b9769fb0e84fff26d Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 14 Feb 2023 01:06:20 -0600 Subject: [PATCH 007/200] include blst.h --- crypto/blst_include.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 586f6069590..4ac79c7723b 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -4,4 +4,6 @@ // blst related definitions // eventually this file would replace blst.h +#include "blst.h" + #endif \ No newline at end of file From 79601b66a999ef80067af66063085a929fde7f65 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 14 Feb 2023 01:09:09 -0600 Subject: [PATCH 008/200] tidy go.mod after removing blst package --- crypto/go.mod | 2 -- crypto/go.sum | 4 ---- 2 files changed, 6 deletions(-) diff --git a/crypto/go.mod b/crypto/go.mod index c7fe54f9ff5..57c20ef9341 100644 --- a/crypto/go.mod +++ b/crypto/go.mod @@ -6,10 +6,8 @@ require ( github.com/btcsuite/btcd/btcec/v2 v2.2.1 github.com/sirupsen/logrus v1.4.2 github.com/stretchr/testify v1.8.0 - github.com/supranational/blst v0.3.10 golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d gonum.org/v1/gonum v0.6.1 - pgregory.net/rapid v0.4.7 ) require ( diff --git a/crypto/go.sum b/crypto/go.sum index 19a05d05d6d..181f9b302c0 100644 --- a/crypto/go.sum +++ b/crypto/go.sum @@ -28,8 +28,6 @@ github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXf github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/supranational/blst v0.3.10 h1:CMciDZ/h4pXDDXQASe8ZGTNKUiVNxVVA5hpci2Uuhuk= -github.com/supranational/blst v0.3.10/go.mod h1:jZJtfjgudtNl4en1tzwPIV3KjUnQUvG3/j+w+fVonLw= golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d h1:sK3txAijHtOK88l68nt020reeT1ZdKLIYetKl95FzVY= golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -54,6 +52,4 @@ gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8 gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -pgregory.net/rapid v0.4.7 h1:MTNRktPuv5FNqOO151TM9mDTa+XHcX6ypYeISDVD14g= -pgregory.net/rapid v0.4.7/go.mod h1:UYpPVyjFHzYBGHIxLFoupi8vwk6rXNzRY9OMvVxFIOU= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= From e64cc36a82377a77b594db460799b892de1f8cab Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 14 Feb 2023 01:14:40 -0600 Subject: [PATCH 009/200] add missing relic flags --- crypto/blst_include.h | 2 ++ crypto/blst_src.c | 10 +--------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 4ac79c7723b..dde3acd5f05 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -1,3 +1,5 @@ +// +build relic + #ifndef __BLST_INCLUDE_H__ #define __BLST_INCLUDE_H__ diff --git a/crypto/blst_src.c b/crypto/blst_src.c index 
c124bcec078..89388b703fe 100644 --- a/crypto/blst_src.c +++ b/crypto/blst_src.c @@ -1,8 +1,4 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ +// +build relic #include "keygen.c" #include "hash_to_field.c" @@ -21,7 +17,3 @@ #include "consts.c" #include "vect.c" #include "exports.c" -#include "rb_tree.c" -#ifdef BLST_FR_PENTAROOT -# include "pentaroot.c" -#endif From e51d94e1744b6d0c92a76b4db77c2ac8d093df24 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 17 Feb 2023 00:43:00 -0600 Subject: [PATCH 010/200] first iteration of scalar type change --- crypto/bls.go | 58 ++++++-------- crypto/bls12381_utils.c | 121 +++++++++++++++++------------ crypto/bls12381_utils.go | 65 ++++++++-------- crypto/bls12381_utils.h | 39 ++++++---- crypto/bls12381_utils_test.go | 19 +++-- crypto/bls_core.c | 7 +- crypto/bls_include.h | 2 +- crypto/bls_multisig.go | 43 +++++----- crypto/bls_test.go | 18 +++-- crypto/bls_thresholdsign.go | 24 +++--- crypto/bls_thresholdsign_core.c | 2 +- crypto/bls_thresholdsign_include.h | 2 +- crypto/bls_thresholdsign_test.go | 2 + crypto/blst_include.h | 11 ++- crypto/dkg.go | 5 +- crypto/dkg_core.c | 6 +- crypto/dkg_feldmanvss.go | 26 ++++--- crypto/dkg_feldmanvssq.go | 17 ++-- crypto/dkg_include.h | 4 +- crypto/dkg_jointfeldman.go | 11 +-- crypto/dkg_test.go | 3 +- crypto/thresholdsign.go | 8 +- 22 files changed, 269 insertions(+), 224 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 6786f00c4d5..f062fa50f5a 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -166,12 +166,9 @@ func (sk *prKeyBLSBLS12381) Sign(data []byte, kmac hash.Hasher) (Signature, erro // hash the input to 128 bytes h := kmac.ComputeHash(data) - // set BLS context - blsInstance.reInit() - s := make([]byte, SignatureLenBLSBLS12381) C.bls_sign((*C.uchar)(&s[0]), - (*C.bn_st)(&sk.scalar), + (*C.Fr)(&sk.scalar), (*C.uchar)(&h[0]), (C.int)(len(h))) return s, nil @@ -203,9 +200,6 @@ func (pk *pubKeyBLSBLS12381) Verify(s Signature, data []byte, kmac hash.Hasher) return false, err } - // intialize BLS context - blsInstance.reInit() - if len(s) != signatureLengthBLSBLS12381 { return false, nil } @@ -292,18 +286,24 @@ func BLSInvalidSignature() Signature { // decodePrivateKey decodes a slice of bytes into a private key. // It checks the scalar is non-zero and is less than the group order. func (a *blsBLS12381Algo) decodePrivateKey(privateKeyBytes []byte) (PrivateKey, error) { - if len(privateKeyBytes) != prKeyLengthBLSBLS12381 { - return nil, invalidInputsErrorf("input length must be %d, got %d", - prKeyLengthBLSBLS12381, len(privateKeyBytes)) - } sk := newPrKeyBLSBLS12381(nil) - readScalar(&sk.scalar, privateKeyBytes) - if C.check_membership_Zr_star((*C.bn_st)(&sk.scalar)) == valid { + read := C.Fr_star_read_bytes( + (*C.Fr)(&sk.scalar), + (*C.uchar)(&privateKeyBytes[0]), + (C.int)(prKeyLengthBLSBLS12381)) + + switch int(read) { + case blst_valid: return sk, nil + case blst_bad_encoding: + return nil, invalidInputsErrorf("input length must be %d, got %d", + prKeyLengthBLSBLS12381, len(privateKeyBytes)) + case blst_bad_scalar: + return nil, invalidInputsErrorf("the private key is not in the correct range for the BLS12-381 curve") + default: + return nil, invalidInputsErrorf("reading the private key failed") } - - return nil, invalidInputsErrorf("the private key is not a valid BLS12-381 curve key") } // decodePublicKey decodes a slice of bytes into a public key. 
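The switch above maps the C-layer return codes of Fr_star_read_bytes onto the library's error types. For reference, a minimal C-side usage sketch of the same validation path, assuming sk_bytes is a hypothetical 32-byte big-endian candidate key buffer:

    Fr sk;
    BLST_ERROR ret = Fr_star_read_bytes(&sk, sk_bytes, Fr_BYTES);
    if (ret == BLST_BAD_ENCODING) {
        // input length was not Fr_BYTES (32)
    } else if (ret == BLST_BAD_SCALAR) {
        // scalar is zero or not strictly less than the group order r
    } else {
        // BLST_SUCCESS: sk holds a valid non-zero element of Fr
    }
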
@@ -356,16 +356,13 @@ type prKeyBLSBLS12381 struct { // If no scalar is provided, the function allocates an // empty scalar. func newPrKeyBLSBLS12381(x *scalar) *prKeyBLSBLS12381 { - var sk prKeyBLSBLS12381 - if x == nil { - // initialize the scalar - C.bn_new_wrapper((*C.bn_st)(&sk.scalar)) - } else { - // set the scalar - sk.scalar = *x + if x != nil { + return &prKeyBLSBLS12381{ + // the embedded public key is only computed when needed + scalar: *x, + } } - // the embedded public key is only computed when needed - return &sk + return &prKeyBLSBLS12381{} } // Algorithm returns the Signing Algorithm @@ -415,7 +412,7 @@ func (sk *prKeyBLSBLS12381) Equals(other PrivateKey) bool { if !ok { return false } - return sk.scalar.equals(&otherBLS.scalar) + return (&sk.scalar).equals(&otherBLS.scalar) } // String returns the hex string representation of the key. @@ -520,15 +517,6 @@ func (a *blsBLS12381Algo) init() error { return nil } -// set the context of BLS 12-381 curve in the lower C and Relic layers assuming the context -// was previously initialized with a call to init(). -// -// If the implementation evolves to support multiple contexts, -// reinit should be called at every blsBLS12381Algo operation. -func (a *blsBLS12381Algo) reInit() { - a.context.setContext() -} - // This is only a TEST/DEBUG/BENCH function. // It returns the hash to G1 point from a slice of 128 bytes func mapToG1(data []byte) *pointG1 { @@ -556,7 +544,7 @@ func (sk *prKeyBLSBLS12381) signWithXMDSHA256(data []byte) Signature { // sign the hash s := make([]byte, SignatureLenBLSBLS12381) C.bls_sign((*C.uchar)(&s[0]), - (*C.bn_st)(&sk.scalar), + (*C.Fr)(&sk.scalar), (*C.uchar)(&hash[0]), (C.int)(len(hash))) return s diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index f8af1b0f073..fea72d33075 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -8,6 +8,16 @@ #include "bls_include.h" #include "assert.h" +// TODO: temp utility function to delete +bn_st* Fr_blst_to_relic(const Fr* x) { + bn_st* out = (bn_st*)malloc(sizeof(bn_st)); + byte* data = (byte*)malloc(Fr_BYTES); + be_bytes_from_limbs(data, (limb_t*)x, Fr_BYTES); + bn_read_bin(out, data, Fr_BYTES); + free(data); + return out; +} + // The functions are tested for ALLOC=AUTO (not for ALLOC=DYNAMIC) // return macro values to the upper Go Layer @@ -120,33 +130,37 @@ void seed_relic(byte* seed, int len) { } // Exponentiation of a generic point p in G1 -void ep_mult(ep_t res, const ep_t p, const bn_t expo) { +void ep_mult(ep_t res, const ep_t p, const Fr *expo) { + bn_st* tmp_expo = Fr_blst_to_relic(expo); // Using window NAF of size 2 - ep_mul_lwnaf(res, p, expo); + ep_mul_lwnaf(res, p, tmp_expo); } // Exponentiation of generator g1 in G1 // These two function are here for bench purposes only -void ep_mult_gen_bench(ep_t res, const bn_t expo) { +void ep_mult_gen_bench(ep_t res, const Fr* expo) { + bn_st* tmp_expo = Fr_blst_to_relic(expo); // Using precomputed table of size 4 - ep_mul_gen(res, (bn_st *)expo); + ep_mul_gen(res, tmp_expo); } -void ep_mult_generic_bench(ep_t res, const bn_t expo) { +void ep_mult_generic_bench(ep_t res, const Fr* expo) { // generic point multiplication ep_mult(res, &core_get()->ep_g, expo); } // Exponentiation of a generic point p in G2 -void ep2_mult(ep2_t res, ep2_t p, bn_t expo) { +void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo) { + bn_st* tmp_expo = Fr_blst_to_relic(expo); // Using window NAF of size 2 - ep2_mul_lwnaf(res, p, expo); + ep2_mul_lwnaf(res, p, tmp_expo); } -// Exponentiation of fixed g2 in 
G2 -void ep2_mult_gen(ep2_t res, const bn_t expo) { +// Exponentiation of generator g2 in G2 +void ep2_mult_gen(ep2_t res, const Fr* expo) { + bn_st* tmp_expo = Fr_blst_to_relic(expo); // Using precomputed table of size 4 - g2_mul_gen(res, (bn_st*)expo); + g2_mul_gen(res, (bn_st*)tmp_expo); } // DEBUG printing functions @@ -183,7 +197,7 @@ void ep2_print_(char* s, ep2_st* p) { } // generates a random number less than the order r -void bn_randZr_star(bn_t x) { +void bn_randZr_star(Fr* x) { // reduce the modular reduction bias const int seed_len = BITS_TO_BYTES(Fr_BITS + SEC_BITS); byte seed[seed_len]; @@ -192,33 +206,16 @@ void bn_randZr_star(bn_t x) { } // generates a random number less than the order r -void bn_randZr(bn_t x) { - bn_t r; - bn_new(r); - g2_get_ord(r); - // reduce the modular reduction bias - bn_new_size(x, BITS_TO_DIGITS(Fr_BITS + SEC_BITS)); - bn_rand(x, RLC_POS, Fr_BITS + SEC_BITS); - bn_mod(x, x, r); - bn_free(r); +void bn_randZr(Fr* x) { + // TODO: SEC_BITS bias reduction } -// reads a scalar from an array and maps it to Zr +// reads a scalar from an array and maps it to Fr // the resulting scalar is in the range 0 < a < r // len must be less than BITS_TO_BYTES(RLC_BN_BITS) -void bn_map_to_Zr_star(bn_t a, const uint8_t* bin, int len) { - bn_t tmp; - bn_new(tmp); - bn_new_size(tmp, BYTES_TO_DIGITS(len)); - bn_read_bin(tmp, bin, len); - bn_t r; - bn_new(r); - g2_get_ord(r); - bn_sub_dig(r,r,1); - bn_mod_basic(a,tmp,r); - bn_add_dig(a,a,1); - bn_free(r); - bn_free(tmp); +void bn_map_to_Zr_star(Fr* a, const uint8_t* bin, int len) { + // TODO: + // a = bin % (r-1) + 1 } // returns the sign of y. @@ -523,26 +520,50 @@ int ep2_read_bin_compact(ep2_t a, const byte *bin, const int len) { return RLC_ERR; } -// reads a scalar in a and checks it is a valid Zr element (a < r) -// returns RLC_OK if the scalar is valid and RLC_ERR otherwise. -int bn_read_Zr_bin(bn_t a, const uint8_t *bin, int len) { - if (len!=Fr_BYTES) { - return RLC_ERR; +bool_t Fr_is_zero(const Fr* a) { + return bytes_are_zero((const byte*)a, Fr_BYTES); +} + +bool_t Fr_is_equal(const Fr* a, const Fr* b) { + return vec_is_equal(a, b, Fr_BYTES); +} + +// reads a scalar in `a` and checks it is a valid Fr element (a < r). +// - BLST_BAD_ENCODING if the length is invalid +// - BLST_BAD_SCALAR if the scalar isn't in Fr +// - v if the scalar is valid +BLST_ERROR Fr_read_bytes(Fr* a, const uint8_t *bin, int len) { + if (len != Fr_BYTES) { + return BLST_BAD_ENCODING; } - bn_read_bin(a, bin, Fr_BYTES); - bn_t r; - bn_new(r); - g2_get_ord(r); - if (bn_cmp(a, r) == RLC_LT) { - return RLC_OK; + if (!check_mod_256(bin, BLS12_381_r)) { // check_mod_256 compares byte[] against a vec256! + return BLST_BAD_SCALAR; } - return RLC_ERR; + limbs_from_be_bytes((limb_t*)a, bin, Fr_BYTES); + return BLST_SUCCESS; +} + +// reads a scalar in `a` and checks it is a valid Fr_star element (0 < a < r). 
+// returns +// - BLST_BAD_ENCODING if the length is invalid +// - BLST_BAD_SCALAR if the scalar isn't in Fr_star +// - BLST_SUCCESS if the scalar is valid +BLST_ERROR Fr_star_read_bytes(Fr* a, const uint8_t *bin, int len) { + int ret = Fr_read_bytes(a, bin, len); + if (ret != BLST_SUCCESS) { + return ret; + } + // check if a=0 + if (Fr_is_zero(a)) { + return BLST_BAD_SCALAR; + } + return BLST_SUCCESS; } // computes the sum of the array elements x and writes the sum in jointx -// the sum is computed in Zr -void bn_sum_vector(bn_t jointx, const bn_st* x, const int len) { - bn_t r; +// the sum is computed in Fr +void Fr_sum_vector(Fr* jointx, const Fr* x, const int len) { + /*bn_t r; bn_new(r); g2_get_ord(r); bn_set_dig(jointx, 0); @@ -552,7 +573,7 @@ void bn_sum_vector(bn_t jointx, const bn_st* x, const int len) { if (bn_cmp(jointx, r) == RLC_GT) bn_sub(jointx, jointx, r); } - bn_free(r); + bn_free(r);*/ } // computes the sum of the G2 array elements y and writes the sum in jointy diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index fa931cffab6..8c6f1277842 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -17,11 +17,16 @@ import ( "errors" ) +// Go wrappers around BLST C types // Go wrappers around Relic C types -// Relic is compiled with ALLOC=AUTO type pointG1 C.ep_st type pointG2 C.ep2_st -type scalar C.bn_st +type scalar C.Fr + +// TODO: For now scalars are represented as field elements Fr since all scalars +// are less than r - check if distinguishing two types in necessary +//type pointG1_blst C.G1 +//type pointG2_blst C.G2 // context required for the BLS set-up type ctx struct { @@ -34,6 +39,12 @@ type ctx struct { var valid = C.get_valid() var invalid = C.get_invalid() +// get some constants from the C layer +// var blst_errors = C.blst_get_errors() +var blst_valid = (int)(C.BLST_SUCCESS) //int(blst_errors[0]) +var blst_bad_encoding = (int)(C.BLST_BAD_ENCODING) // int(blst_errors[0]) +var blst_bad_scalar = (int)(C.BLST_BAD_SCALAR) // int(blst_errors[0]) + // initContext sets relic B12_381 parameters and precomputes some data in the C layer func (ct *ctx) initContext() error { c := C.relic_init_BLS12_381() @@ -62,39 +73,32 @@ func seedRelic(seed []byte) error { return nil } -// setContext sets the context (previously initialized) of the C layer with -// pre-saved data. 
-func (ct *ctx) setContext() { - C.core_set(ct.relicCtx) - C.precomputed_data_set(ct.precCtx) -} - // Exponentiation in G1 (scalar point multiplication) func (p *pointG1) scalarMultG1(res *pointG1, expo *scalar) { - C.ep_mult((*C.ep_st)(res), (*C.ep_st)(p), (*C.bn_st)(expo)) + C.ep_mult((*C.ep_st)(res), (*C.ep_st)(p), (*C.Fr)(expo)) } // This function is for TEST only // Exponentiation of g1 in G1 func generatorScalarMultG1(res *pointG1, expo *scalar) { - C.ep_mult_gen_bench((*C.ep_st)(res), (*C.bn_st)(expo)) + C.ep_mult_gen_bench((*C.ep_st)(res), (*C.Fr)(expo)) } // This function is for TEST only // Generic Exponentiation G1 func genericScalarMultG1(res *pointG1, expo *scalar) { - C.ep_mult_generic_bench((*C.ep_st)(res), (*C.bn_st)(expo)) + C.ep_mult_generic_bench((*C.ep_st)(res), (*C.Fr)(expo)) } // Exponentiation of g2 in G2 func generatorScalarMultG2(res *pointG2, expo *scalar) { - C.ep2_mult_gen((*C.ep2_st)(res), (*C.bn_st)(expo)) + C.ep2_mult_gen((*C.ep2_st)(res), (*C.Fr)(expo)) } -// comparison in Zr where r is the group order of G1/G2 +// comparison in Fr where r is the group order of G1/G2 // (both scalars should be reduced mod r) func (x *scalar) equals(other *scalar) bool { - return C.bn_cmp((*C.bn_st)(x), (*C.bn_st)(other)) == valid + return C.Fr_is_equal((*C.Fr)(x), (*C.Fr)(other)) != 0 } // comparison in G2 @@ -102,10 +106,10 @@ func (p *pointG2) equals(other *pointG2) bool { return C.ep2_cmp((*C.ep2_st)(p), (*C.ep2_st)(other)) == valid } -// Comparison to zero in Zr. +// Comparison to zero in Fr. // Scalar must be already reduced modulo r func (x *scalar) isZero() bool { - return C.bn_is_zero((*C.bn_st)(x)) == 1 + return C.Fr_is_zero((*C.Fr)(x)) != 0 } // Comparison to point at infinity in G2. @@ -113,17 +117,17 @@ func (p *pointG2) isInfinity() bool { return C.ep2_is_infty((*C.ep2_st)(p)) == 1 } -// returns a random number in Zr +// returns a random number in Fr func randZr(x *scalar) { - C.bn_randZr((*C.bn_st)(x)) + //C.bn_randZr((*C.Fr)(x)) } -// returns a random non-zero number in Zr +// returns a random non-zero number in Fr func randZrStar(x *scalar) { - C.bn_randZr_star((*C.bn_st)(x)) + //C.bn_randZr_star((*C.Fr)(x)) } -// mapToZrStar reads a scalar from a slice of bytes and maps it to Zr +// mapToZrStar reads a scalar from a slice of bytes and maps it to Fr // the resulting scalar is in the range 0 < k < r func mapToZrStar(x *scalar, src []byte) error { if len(src) > maxScalarSize { @@ -131,7 +135,7 @@ func mapToZrStar(x *scalar, src []byte) error { "input slice length must be less than %d", maxScalarSize) } - C.bn_map_to_Zr_star((*C.bn_st)(x), + C.bn_map_to_Zr_star((*C.Fr)(x), (*C.uchar)(&src[0]), (C.int)(len(src))) return nil @@ -139,18 +143,11 @@ func mapToZrStar(x *scalar, src []byte) error { // writeScalar writes a G2 point in a slice of bytes func writeScalar(dest []byte, x *scalar) { - C.bn_write_bin((*C.uchar)(&dest[0]), + /*C.bn_write_bin((*C.uchar)(&dest[0]), (C.int)(prKeyLengthBLSBLS12381), - (*C.bn_st)(x), - ) -} - -// readScalar reads a scalar from a slice of bytes -func readScalar(x *scalar, src []byte) { - C.bn_read_bin((*C.bn_st)(x), - (*C.uchar)(&src[0]), - (C.int)(len(src)), - ) + (*C.Fr)(x), + )*/ + // TODO: to fill } // writePointG2 writes a G2 point in a slice of bytes diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index d6978d6188d..06bb81332fe 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -17,16 +17,18 @@ #define BITS_TO_BYTES(x) ((x+7)>>3) #define BITS_TO_DIGITS(x) ((x+63)>>6) #define BYTES_TO_DIGITS(x) 
((x+7)>>3) +#define DIGITS_TO_BYTES(x) ((x)<<3) #define MIN(a,b) ((a)>(b)?(b):(a)) // Fields and Group serialization lengths #define SEC_BITS 128 #define Fp_BITS 381 -#define Fr_BITS 255 -#define Fp_BYTES BITS_TO_BYTES(Fp_BITS) #define Fp2_BYTES (2*Fp_BYTES) #define Fp_DIGITS BITS_TO_DIGITS(Fp_BITS) -#define Fr_BYTES BITS_TO_BYTES(Fr_BITS) +#define Fp_BYTES DIGITS_TO_BYTES(Fp_DIGITS) // BLST implements Fp as a limb array +#define Fr_BITS 255 +#define Fr_DIGITS BITS_TO_DIGITS(Fr_BITS) +#define Fr_BYTES DIGITS_TO_BYTES(Fr_DIGITS) // BLST implements Fr as a limb array #define G1_BYTES (2*Fp_BYTES) #define G2_BYTES (2*Fp2_BYTES) @@ -76,12 +78,19 @@ typedef struct prec_ { fp_t r; // Montgomery multiplication constant } prec_st; +// TODO: to delete when Relic is removed +bn_st* Fr_blst_to_relic(const Fr* x); + // BLS based SPoCK int bls_spock_verify(const ep2_t, const byte*, const ep2_t, const byte*); // hash to curve functions (functions in bls12381_hashtocurve.c) void map_to_G1(ep_t, const byte*, const int); +// Fr utilities +bool_t Fr_is_zero(const Fr* a); +bool_t Fr_is_equal(const Fr* a, const Fr* b); + // Utility functions int get_valid(); int get_invalid(); @@ -96,18 +105,22 @@ int ep_read_bin_compact(ep_t, const byte *, const int); void ep_write_bin_compact(byte *, const ep_t, const int); int ep2_read_bin_compact(ep2_t, const byte *, const int); void ep2_write_bin_compact(byte *, const ep2_t, const int); -int bn_read_Zr_bin(bn_t, const uint8_t *, int ); +BLST_ERROR Fr_read_bytes(Fr* a, const uint8_t *bin, int len); +BLST_ERROR Fr_star_read_bytes(Fr* a, const uint8_t *bin, int len); + + -void ep_mult_gen_bench(ep_t, const bn_t); -void ep_mult_generic_bench(ep_t, const bn_t); -void ep_mult(ep_t, const ep_t, const bn_t); -void ep2_mult_gen(ep2_t, const bn_t); +void ep_mult_gen_bench(ep_t, const Fr*); +void ep_mult_generic_bench(ep_t, const Fr*); +void ep_mult(ep_t, const ep_t, const Fr*); +void ep2_mult_gen(ep2_t, const Fr*); +void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo); -void bn_randZr(bn_t); -void bn_randZr_star(bn_t); -void bn_map_to_Zr_star(bn_t, const uint8_t*, int); +void bn_randZr(Fr*); +void bn_randZr_star(Fr*); +void bn_map_to_Zr_star(Fr*, const uint8_t*, int); -void bn_sum_vector(bn_t, const bn_st*, const int); +void Fr_sum_vector(Fr*, const Fr*, const int); void ep_sum_vector(ep_t, ep_st*, const int); void ep2_sum_vector(ep2_t, ep2_st*, const int); int ep_sum_vector_byte(byte*, const byte*, const int); @@ -116,7 +129,7 @@ void ep2_subtract_vector(ep2_t res, ep2_t x, ep2_st* y, const int len); // membership checks int check_membership_G1(const ep_t); int check_membership_G2(const ep2_t); -int check_membership_Zr_star(const bn_t); +int check_membership_Fr_star(const bn_t); int simple_subgroup_check_G1(const ep_t); int simple_subgroup_check_G2(const ep2_t); diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 8911ada1769..ce8f6d9df09 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -26,8 +26,8 @@ func TestDeterministicKeyGen(t *testing.T) { } // test the deterministicity of the relic PRG (used by the DKG polynomials) -func TestPRGseeding(t *testing.T) { - blsInstance.reInit() +/*func TestPRGseeding(t *testing.T) { + // 2 scalars generated with the same seed should be equal seed := make([]byte, KeyGenSeedMinLenBLSBLS12381) n, err := rand.Read(seed) @@ -37,24 +37,24 @@ func TestPRGseeding(t *testing.T) { err = seedRelic(seed) require.Nil(t, err) var sk1 prKeyBLSBLS12381 - randZr(&sk1.scalar) + randZr(sk1.scalar) 
// 2nd scalar (wrapped in a private key) err = seedRelic(seed) require.Nil(t, err) var sk2 prKeyBLSBLS12381 - randZr(&sk2.scalar) + randZr(sk2.scalar) // compare the 2 scalars (by comparing the private keys) assert.True(t, sk1.Equals(&sk2), "private keys should be equal") -} +}*/ // G1 and G2 scalar multiplication func BenchmarkScalarMultG1G2(b *testing.B) { - blsInstance.reInit() + seed := make([]byte, securityBits/8) _, _ = rand.Read(seed) _ = seedRelic(seed) var expo scalar - randZr(&expo) + randZr(&expo) // TODO: upadate // G1 generator multiplication b.Run("G1 gen", func(b *testing.B) { @@ -122,7 +122,7 @@ func TestMapToG1(t *testing.T) { // Hashing to G1 bench func BenchmarkMapToG1(b *testing.B) { - blsInstance.reInit() + input := make([]byte, expandMsgOutput) for i := 0; i < len(input); i++ { input[i] = byte(i) @@ -136,7 +136,7 @@ func BenchmarkMapToG1(b *testing.B) { // test subgroup membership check in G1 and G2 func TestSubgroupCheck(t *testing.T) { - blsInstance.reInit() + // seed Relic PRG seed := make([]byte, securityBits/8) _, _ = rand.Read(seed) @@ -165,7 +165,6 @@ func TestSubgroupCheck(t *testing.T) { // subgroup membership check bench func BenchmarkSubgroupCheck(b *testing.B) { - blsInstance.reInit() b.Run("G1", func(b *testing.B) { var p pointG1 diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 7cb8a04aef6..cdfc6aaf7f1 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -21,7 +21,7 @@ int get_sk_len() { // checks an input scalar a satisfies 0 < a < r // where (r) is the order of G1/G2 -int check_membership_Zr_star(const bn_t a){ +int check_membership_Fr_star(const bn_t a){ if (bn_cmp(a, &core_get()->ep_r) != RLC_LT || bn_cmp_dig(a, 0) != RLC_GT) { return INVALID; } @@ -68,9 +68,10 @@ int check_membership_G2(const ep2_t p){ } // Computes a BLS signature from a G1 point -static void bls_sign_ep(byte* s, const bn_t sk, const ep_t h) { +static void bls_sign_ep(byte* s, const Fr* sk, const ep_t h) { ep_t p; ep_new(p); + // s = h^sk ep_mult(p, h, sk); ep_write_bin_compact(s, p, SIGNATURE_LEN); @@ -78,7 +79,7 @@ static void bls_sign_ep(byte* s, const bn_t sk, const ep_t h) { } // Computes a BLS signature from a hash -void bls_sign(byte* s, const bn_t sk, const byte* data, const int len) { +void bls_sign(byte* s, const Fr* sk, const byte* data, const int len) { ep_t h; ep_new(h); // hash to G1 diff --git a/crypto/bls_include.h b/crypto/bls_include.h index 016845719e1..325203479b2 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -35,7 +35,7 @@ int get_signature_len(); int get_pk_len(); int get_sk_len(); -void bls_sign(byte*, const bn_t, const byte*, const int); +void bls_sign(byte*, const Fr*, const byte*, const int); int bls_verify(const ep2_t, const byte*, const byte*, const int); int bls_verifyPerDistinctMessage(const byte*, const int, const byte*, const uint32_t*, const uint32_t*, const ep2_st*); diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index a915bed4a64..297e61267d9 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -5,9 +5,12 @@ package crypto import ( "errors" - "fmt" - "github.com/onflow/flow-go/crypto/hash" + _ "errors" + + _ "fmt" + + _ "github.com/onflow/flow-go/crypto/hash" ) // BLS multi-signature using BLS12-381 curve @@ -38,6 +41,7 @@ import "C" // used for signatures. var popKMAC = internalExpandMsgXOFKMAC128(blsPOPCipherSuite) +/* // BLSGeneratePOP returns a proof of possession (PoP) for the receiver private key. 
// // The KMAC hasher used in the function is guaranteed to be orthogonal to all hashers used @@ -92,8 +96,8 @@ func BLSVerifyPOP(pk PublicKey, s Signature) (bool, error) { // - (nil, error) if an unexpected error occurs // - (aggregated_signature, nil) otherwise func AggregateBLSSignatures(sigs []Signature) (Signature, error) { - // set BLS context - blsInstance.reInit() + + // check for empty list if len(sigs) == 0 { @@ -139,8 +143,8 @@ func AggregateBLSSignatures(sigs []Signature) (Signature, error) { // - (nil, blsAggregateEmptyListError) if no keys are provided (input slice is empty) // - (aggregated_key, nil) otherwise func AggregateBLSPrivateKeys(keys []PrivateKey) (PrivateKey, error) { - // set BLS context - blsInstance.reInit() + + // check for empty list if len(keys) == 0 { @@ -157,8 +161,7 @@ func AggregateBLSPrivateKeys(keys []PrivateKey) (PrivateKey, error) { } var sum scalar - C.bn_new_wrapper((*C.bn_st)(&sum)) - C.bn_sum_vector((*C.bn_st)(&sum), (*C.bn_st)(&scalars[0]), + C.Fr_sum_vector((*C.Fr)(&sum), (*C.Fr)(&scalars[0]), (C.int)(len(scalars))) return newPrKeyBLSBLS12381(&sum), nil } @@ -177,8 +180,8 @@ func AggregateBLSPrivateKeys(keys []PrivateKey) (PrivateKey, error) { // - (nil, blsAggregateEmptyListError) no keys are provided (input slice is empty) // - (aggregated_key, nil) otherwise func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { - // set BLS context - blsInstance.reInit() + + // check for empty list if len(keys) == 0 { @@ -200,13 +203,12 @@ func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { sumKey := newPubKeyBLSBLS12381(&sum) return sumKey, nil -} +}*/ // IdentityBLSPublicKey returns an identity public key which corresponds to the point // at infinity in G2 (identity element of G2). +// TODO: return a constant key instead of a newly allocated one func IdentityBLSPublicKey() PublicKey { - // set BLS context - blsInstance.reInit() identity := *newPubKeyBLSBLS12381(nil) // set the point to infinity @@ -215,6 +217,8 @@ func IdentityBLSPublicKey() PublicKey { return &identity } +/* + // RemoveBLSPublicKeys removes multiple BLS public keys from a given (aggregated) public key. // // The common use case assumes the aggregated public key was initially formed using @@ -230,8 +234,8 @@ func IdentityBLSPublicKey() PublicKey { // - (nil, notBLSKeyError) if at least one input key is not of type BLS BLS12-381 // - (remaining_key, nil) otherwise func RemoveBLSPublicKeys(aggKey PublicKey, keysToRemove []PublicKey) (PublicKey, error) { - // set BLS context - blsInstance.reInit() + + aggPKBLS, ok := aggKey.(*pubKeyBLSBLS12381) if !ok { @@ -330,8 +334,8 @@ func VerifyBLSSignatureOneMessage( func VerifyBLSSignatureManyMessages( pks []PublicKey, s Signature, messages [][]byte, kmac []hash.Hasher, ) (bool, error) { - // set BLS context - blsInstance.reInit() + + // check signature length if len(s) != signatureLengthBLSBLS12381 { @@ -479,8 +483,8 @@ func VerifyBLSSignatureManyMessages( func BatchVerifyBLSSignaturesOneMessage( pks []PublicKey, sigs []Signature, message []byte, kmac hash.Hasher, ) ([]bool, error) { - // set BLS context - blsInstance.reInit() + + // empty list check if len(pks) == 0 { @@ -545,6 +549,7 @@ func BatchVerifyBLSSignaturesOneMessage( return verifBool, nil } +*/ // blsAggregateEmptyListError is returned when a list of BLS objects (e.g. signatures or keys) // is empty or nil and thereby represents an invalid input. 
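AggregateBLSPrivateKeys above now delegates the scalar sum to Fr_sum_vector, whose Relic-based body is commented out in bls12381_utils.c. A minimal sketch of that sum using BLST's constant-time modular addition, assuming add_mod_256 and the BLS12_381_r constant from the vendored blst_src, and that every input limb vector is already reduced mod r:

    void Fr_sum_vector(Fr* jointx, const Fr* x, const int len) {
        vec_zero((limb_t*)jointx, Fr_BYTES);       // jointx = 0
        for (int i = 0; i < len; i++) {
            // jointx = (jointx + x[i]) mod r
            add_mod_256((limb_t*)jointx, (const limb_t*)jointx, (const limb_t*)&x[i], BLS12_381_r);
        }
    }
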
diff --git a/crypto/bls_test.go b/crypto/bls_test.go index d0dc73c066c..df0afe1e96d 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -7,9 +7,9 @@ import ( "crypto/rand" "encoding/hex" "fmt" - mrand "math/rand" + _ "math/rand" "testing" - "time" + _ "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -131,7 +131,7 @@ func TestBLSBLS12381Hasher(t *testing.T) { assert.GreaterOrEqual(t, len(blsPOPCipherSuite), 16) }) - t.Run("orthogonal PoP and signature hashing", func(t *testing.T) { + /*t.Run("orthogonal PoP and signature hashing", func(t *testing.T) { data := []byte("random_data") // empty tag hasher sigKmac := NewExpandMsgXOFKMAC128("") @@ -140,7 +140,7 @@ func TestBLSBLS12381Hasher(t *testing.T) { // PoP hasher h2 := popKMAC.ComputeHash(data) assert.NotEqual(t, h1, h2) - }) + })*/ } @@ -216,7 +216,7 @@ func TestBLSUtils(t *testing.T) { } // BLS Proof of Possession test -func TestBLSPOP(t *testing.T) { +/*func TestBLSPOP(t *testing.T) { r := time.Now().UnixNano() mrand.Seed(r) t.Logf("math rand seed is %d", r) @@ -268,6 +268,8 @@ func TestBLSPOP(t *testing.T) { }) } + + // BLS multi-signature // signature aggregation sanity check // @@ -934,7 +936,7 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { assert.False(t, valid, "verification should fail with nil hasher") inputPks[0] = tmpPK }) -} +}*/ // TestBLSErrorTypes verifies working of error-type-detecting functions // such as `IsInvalidInputsError`. @@ -962,6 +964,7 @@ func TestBLSErrorTypes(t *testing.T) { }) } +/* // VerifyBLSSignatureManyMessages bench // Bench the slowest case where all messages and public keys are distinct. // (2*n) pairings without aggrgetion Vs (n+1) pairings with aggregation. @@ -1057,6 +1060,7 @@ func BenchmarkAggregate(b *testing.B) { }) } + func TestBLSIdentity(t *testing.T) { r := time.Now().UnixNano() mrand.Seed(r) @@ -1109,4 +1113,4 @@ func TestBLSIdentity(t *testing.T) { assert.NoError(t, err) assert.False(t, valid) }) -} +}*/ diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 4aa73278d3a..df3da1a108d 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -5,7 +5,7 @@ package crypto // #cgo CFLAGS: // #include "bls_thresholdsign_include.h" -import "C" +/*import "C" import ( "fmt" @@ -412,8 +412,8 @@ func (s *blsThresholdSignatureInspector) reconstructThresholdSignature() (Signat signers = append(signers, index) } - // set BLS settings - blsInstance.reInit() + + // Lagrange Interpolate at point 0 result := C.G1_lagrangeInterpolateAtZero( @@ -456,8 +456,8 @@ func (s *blsThresholdSignatureInspector) reconstructThresholdSignature() (Signat // are considered to reconstruct the signature. 
func BLSReconstructThresholdSignature(size int, threshold int, shares []Signature, signers []int) (Signature, error) { - // set BLS settings - blsInstance.reInit() + + if size < ThresholdSignMinSize || size > ThresholdSignMaxSize { return nil, invalidInputsErrorf( @@ -558,8 +558,8 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, threshold) } - // set BLS settings - blsInstance.reInit() + + // the scalars x and G2 points y x := make([]scalar, size) @@ -570,7 +570,7 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, if err := seedRelic(seed); err != nil { return nil, nil, nil, fmt.Errorf("seeding relic failed: %w", err) } - // Generate a polynomial P in Zr[X] of degree t + // Generate a polynomial P in Fr[X] of degree t a := make([]scalar, threshold+1) randZrStar(&a[0]) // non-identity key if threshold > 0 { @@ -581,10 +581,10 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, } // compute the shares for i := index(1); int(i) <= size; i++ { - C.Zr_polynomialImage( - (*C.bn_st)(&x[i-1]), + C.Fr_polynomialImage( + (*C.Fr)(&x[i-1]), (*C.ep2_st)(&y[i-1]), - (*C.bn_st)(&a[0]), (C.int)(len(a)), + (*C.Fr)(&a[0]), (C.int)(len(a)), (C.uint8_t)(i), ) } @@ -604,4 +604,4 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, // are sampled uniformly at random. The probability of // generating an identity key is therefore negligible. return skShares, pkShares, pkGroup, nil -} +}*/ diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index dc57355df47..94a12a024d7 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -9,7 +9,7 @@ static void Zr_lagrangeCoefficientAtZero(bn_t res, const int i, const uint8_t* s bn_t r, r_2; bn_new(r); g2_get_ord(r); - // (r-2) is needed to compute the inverse in Zr + // (r-2) is needed to compute the inverse in Fr // using little Fermat theorem bn_new(r_2); bn_sub_dig(r_2, r, 2); diff --git a/crypto/bls_thresholdsign_include.h b/crypto/bls_thresholdsign_include.h index 7471e1a0a3d..9b3a700fc96 100644 --- a/crypto/bls_thresholdsign_include.h +++ b/crypto/bls_thresholdsign_include.h @@ -10,6 +10,6 @@ #define MAX_IND_LOOPS 32 int G1_lagrangeInterpolateAtZero(byte*, const byte* , const uint8_t*, const int); -extern void Zr_polynomialImage(bn_t out, ep2_t y, const bn_st* a, const int a_size, const byte x); +extern void Fr_polynomialImage(bn_t out, ep2_t y, const bn_st* a, const int a_size, const byte x); #endif diff --git a/crypto/bls_thresholdsign_test.go b/crypto/bls_thresholdsign_test.go index cc9be81eeaf..7f0802d57b9 100644 --- a/crypto/bls_thresholdsign_test.go +++ b/crypto/bls_thresholdsign_test.go @@ -3,6 +3,7 @@ package crypto +/* import ( "crypto/rand" "fmt" @@ -615,3 +616,4 @@ func BenchmarkSimpleKeyGen(b *testing.B) { } b.StopTimer() } +*/ diff --git a/crypto/blst_include.h b/crypto/blst_include.h index dde3acd5f05..2721edcd97a 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -6,6 +6,15 @@ // blst related definitions // eventually this file would replace blst.h -#include "blst.h" +#include "blst.h" // TODO: should be deleted +#include "point.h" +#include "consts.h" + +// field elements F_r +typedef struct {limb_t limbs[4];} Fr; // also used as vec256; +// Subroup G1 in E1 +typedef POINTonE1 G1; +// Subroup G1 in E2 +typedef POINTonE2 G2; #endif \ No newline at end of file diff --git a/crypto/dkg.go b/crypto/dkg.go index 6e74f3d54a5..3e369b77fa4 100644 --- a/crypto/dkg.go +++ 
b/crypto/dkg.go @@ -1,5 +1,6 @@ package crypto +/* import ( "errors" "fmt" @@ -22,7 +23,7 @@ import ( // Flow uses DKG with the value t = floor((n-1)/2) to optimize for unforgeability and robustness // of the threshold signature scheme using the output keys. // -// Private keys are scalar in Zr, where r is the group order of G1/G2. +// Private keys are scalar in Fr, where r is the group order of G1/G2. // Public keys are in G2. const ( @@ -234,4 +235,4 @@ type DKGProcessor interface { // do so, the protocol can be broken. // log describes the misbehavior. FlagMisbehavior(participant int, log string) -} +}*/ diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 3a2bce01559..50923ee9087 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -11,10 +11,10 @@ // r being the order of G1 // writes P(x) in out and P(x).g2 in y if y is non NULL // x being a small integer -void Zr_polynomialImage_export(byte* out, ep2_t y, const bn_st* a, const int a_size, const byte x){ +void Fr_polynomialImage_export(byte* out, ep2_t y, const bn_st* a, const int a_size, const byte x){ bn_t image; bn_new(image); - Zr_polynomialImage(image, y, a, a_size, x); + Fr_polynomialImage(image, y, a, a_size, x); // exports the result const int out_size = Fr_BYTES; bn_write_bin(out, out_size, image); @@ -25,7 +25,7 @@ void Zr_polynomialImage_export(byte* out, ep2_t y, const bn_st* a, const int a_s // r being the order of G1 // writes P(x) in out and P(x).g2 in y if y is non NULL // x being a small integer -void Zr_polynomialImage(bn_t image, ep2_t y, const bn_st *a, const int a_size, const byte x){ +void Fr_polynomialImage(bn_t image, ep2_t y, const bn_st *a, const int a_size, const byte x){ bn_t r; bn_new(r); g2_get_ord(r); diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index 175fe5acb0a..76a5aebcd49 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -3,6 +3,7 @@ package crypto +/* // #cgo CFLAGS: // #include "dkg_include.h" import "C" @@ -21,7 +22,7 @@ import ( // partcipants including itself. The particpants validate their shares // using a public verifiaction vector shared by the . -// Private keys are scalar in Zr, where r is the group order of G1/G2 +// Private keys are scalar in Fr, where r is the group order of G1/G2 // Public keys are in G2. // feldman VSS protocol, implements DKGState @@ -30,7 +31,7 @@ type feldmanVSSstate struct { *dkgCommon // participant index dealerIndex index - // Polynomial P = a_0 + a_1*x + .. + a_t*x^t in Zr[X], the vector size is (t+1) + // Polynomial P = a_0 + a_1*x + .. + a_t*x^t in Fr[X], the vector size is (t+1) // a_0 is the group private key a []scalar // Public vector of the group, the vector size is (t+1) @@ -77,12 +78,12 @@ func NewFeldmanVSS(size int, threshold int, myIndex int, func (s *feldmanVSSstate) init() { // set the bls context - blsInstance.reInit() + s.running = false s.y = nil s.xReceived = false s.vAReceived = false - C.bn_new_wrapper((*C.bn_st)(&s.x)) + C.bn_new_wrapper((*C.Fr)(&s.x)) } // Start triggers the protocol start for the current participant. 
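Fr_polynomialImage above still evaluates the share polynomial with Relic bignums. The target shape after the migration is a Horner evaluation over Fr; a rough sketch is below, where Fr_set_limb, Fr_mul and Fr_add are hypothetical helpers (not part of this patch) standing for a small-integer load, a multiplication and an addition mod r:

    // image = a_0 + a_1*x + ... + a_(n-1)*x^(n-1) (mod r) with n = a_size, by Horner's rule
    static void Fr_polynomialImage_sketch(Fr* image, const Fr* a, const int a_size, const byte x) {
        Fr x_fr;
        Fr_set_limb(&x_fr, x);              // hypothetical: x_fr = x as an Fr element
        *image = a[a_size - 1];
        for (int i = a_size - 2; i >= 0; i--) {
            Fr_mul(image, image, &x_fr);    // hypothetical: image *= x   (mod r)
            Fr_add(image, image, &a[i]);    // hypothetical: image += a_i (mod r)
        }
    }
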
@@ -264,7 +265,7 @@ func (s *feldmanVSSstate) generateShares(seed []byte) error { return fmt.Errorf("generating shares failed: %w", err) } - // Generate a polyomial P in Zr[X] of degree t + // Generate a polyomial P in Fr[X] of degree t s.a = make([]scalar, s.threshold+1) s.vA = make([]pointG2, s.threshold+1) s.y = make([]pointG2, s.size) @@ -273,7 +274,7 @@ func (s *feldmanVSSstate) generateShares(seed []byte) error { generatorScalarMultG2(&s.vA[0], &s.a[0]) if s.threshold > 0 { for i := 1; i < s.threshold; i++ { - C.bn_new_wrapper((*C.bn_st)(&s.a[i])) + C.bn_new_wrapper((*C.Fr)(&s.a[i])) randZr(&s.a[i]) generatorScalarMultG2(&s.vA[i], &s.a[i]) } @@ -288,7 +289,7 @@ func (s *feldmanVSSstate) generateShares(seed []byte) error { if i-1 == s.myIndex { xdata := make([]byte, shareSize) zrPolynomialImage(xdata, s.a, i, &s.y[i-1]) - C.bn_read_bin((*C.bn_st)(&s.x), + C.bn_read_bin((*C.Fr)(&s.x), (*C.uchar)(&xdata[0]), PrKeyLenBLSBLS12381, ) @@ -350,7 +351,7 @@ func (s *feldmanVSSstate) receiveShare(origin index, data []byte) { } // read the participant private share - if C.bn_read_Zr_bin((*C.bn_st)(&s.x), + if C.Fr_read_bytes((*C.Fr)(&s.x), (*C.uchar)(&data[0]), PrKeyLenBLSBLS12381, ) != valid { @@ -405,14 +406,14 @@ func (s *feldmanVSSstate) receiveVerifVector(origin index, data []byte) { } } -// zrPolynomialImage computes P(x) = a_0 + a_1*x + .. + a_n*x^n (mod r) in Z/Zr +// zrPolynomialImage computes P(x) = a_0 + a_1*x + .. + a_n*x^n (mod r) in Z/Fr // r being the order of G1 // P(x) is written in dest, while g2^P(x) is written in y // x being a small integer func zrPolynomialImage(dest []byte, a []scalar, x index, y *pointG2) { - C.Zr_polynomialImage_export((*C.uchar)(&dest[0]), + C.Fr_polynomialImage_export((*C.uchar)(&dest[0]), (*C.ep2_st)(y), - (*C.bn_st)(&a[0]), (C.int)(len(a)), + (*C.Fr)(&a[0]), (C.int)(len(a)), (C.uint8_t)(x), ) } @@ -441,7 +442,7 @@ func readVerifVector(A []pointG2, src []byte) error { func (s *feldmanVSSstate) verifyShare() bool { // check y[current] == x.G2 - return C.verifyshare((*C.bn_st)(&s.x), + return C.verifyshare((*C.Fr)(&s.x), (*C.ep2_st)(&s.y[s.myIndex])) == 1 } @@ -455,3 +456,4 @@ func (s *feldmanVSSstate) computePublicKeys() { (*C.ep2_st)(&s.vA[0]), (C.int)(len(s.vA)), ) } +*/ diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index ecb26f7d6e3..76f343256a4 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -3,6 +3,7 @@ package crypto +/* // #cgo CFLAGS: // #include "dkg_include.h" import "C" @@ -27,7 +28,7 @@ import ( // a complaint answer. The protocol ends with all honest participants // reaching a consensus about the dealer qualification/disqualification. -// Private keys are scalar in Zr, where r is the group order of G1/G2 +// Private keys are scalar in Fr, where r is the group order of G1/G2 // Public keys are in G2. 
// feldman VSS protocol, with complaint mechanism, implements DKGState @@ -402,7 +403,7 @@ func (s *feldmanVSSQualState) receiveShare(origin index, data []byte) { return } // read the participant private share - if C.bn_read_Zr_bin((*C.bn_st)(&s.x), + if C.Fr_read_bytes((*C.Fr)(&s.x), (*C.uchar)(&data[0]), PrKeyLenBLSBLS12381, ) != valid { @@ -507,7 +508,7 @@ func (s *feldmanVSSQualState) buildAndBroadcastComplaintAnswer(complainee index) // - true if the complaint answer is not correct func (s *feldmanVSSQualState) checkComplaint(complainer index, c *complaint) bool { // check y[complainer] == share.G2 - return C.verifyshare((*C.bn_st)(&c.answer), + return C.verifyshare((*C.Fr)(&c.answer), (*C.ep2_st)(&s.y[complainer])) == 0 } @@ -624,8 +625,8 @@ func (s *feldmanVSSQualState) receiveComplaintAnswer(origin index, data []byte) } // read the complainer private share - C.bn_new_wrapper((*C.bn_st)(&s.complaints[complainer].answer)) - if C.bn_read_Zr_bin((*C.bn_st)(&s.complaints[complainer].answer), + C.bn_new_wrapper((*C.Fr)(&s.complaints[complainer].answer)) + if C.Fr_read_bytes((*C.Fr)(&s.complaints[complainer].answer), (*C.uchar)(&data[1]), PrKeyLenBLSBLS12381, ) != valid { @@ -648,8 +649,8 @@ func (s *feldmanVSSQualState) receiveComplaintAnswer(origin index, data []byte) // flag check is a sanity check if c.received { // read the complainer private share - C.bn_new_wrapper((*C.bn_st)(&c.answer)) - if C.bn_read_Zr_bin((*C.bn_st)(&c.answer), + C.bn_new_wrapper((*C.Fr)(&c.answer)) + if C.Fr_read_bytes((*C.Fr)(&c.answer), (*C.uchar)(&data[1]), PrKeyLenBLSBLS12381, ) != valid { @@ -672,4 +673,4 @@ func (s *feldmanVSSQualState) receiveComplaintAnswer(origin index, data []byte) s.x = c.answer } } -} +}*/ diff --git a/crypto/dkg_include.h b/crypto/dkg_include.h index 5e518300071..f50b143961d 100644 --- a/crypto/dkg_include.h +++ b/crypto/dkg_include.h @@ -9,8 +9,8 @@ #define MAX_IND 255 #define MAX_IND_BITS 8 -void Zr_polynomialImage_export(byte* out, ep2_t y, const bn_st* a, const int a_size, const byte x); -void Zr_polynomialImage(bn_t out, ep2_t y, const bn_st* a, const int a_size, const byte x); +void Fr_polynomialImage_export(byte* out, ep2_t y, const bn_st* a, const int a_size, const byte x); +void Fr_polynomialImage(bn_t out, ep2_t y, const bn_st* a, const int a_size, const byte x); void G2_polynomialImages(ep2_st* y, const int len_y, const ep2_st* A, const int len_A); void ep2_vector_write_bin(byte* out, const ep2_st* A, const int len); int ep2_vector_read_bin(ep2_st* A, const byte* src, const int len); diff --git a/crypto/dkg_jointfeldman.go b/crypto/dkg_jointfeldman.go index d79379f7d83..51733e803fb 100644 --- a/crypto/dkg_jointfeldman.go +++ b/crypto/dkg_jointfeldman.go @@ -3,6 +3,7 @@ package crypto +/* // #cgo CFLAGS: // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #include "dkg_include.h" @@ -34,7 +35,7 @@ import ( // from the protocol, and the overall key is taking into account // all chunks from qualified dealers. -// Private keys are scalar in Zr, where r is the group order of G1/G2 +// Private keys are scalar in Fr, where r is the group order of G1/G2 // Public keys are in G2. 
// Joint Feldman protocol, with complaint mechanism, implements DKGState @@ -202,7 +203,7 @@ func (s *JointFeldmanState) End() (PrivateKey, PublicKey, []PublicKey, error) { jointx, jointPublicKey, jointy := s.sumUpQualifiedKeys(s.size - disqualifiedTotal) // private key of the current participant - x := newPrKeyBLSBLS12381(jointx) + x := newPrKeyBLSBLS12381(&jointx) // Group public key Y := newPubKeyBLSBLS12381(jointPublicKey) @@ -303,8 +304,8 @@ func (s *JointFeldmanState) sumUpQualifiedKeys(qualified int) (*scalar, *pointG2 // sum up x var jointx scalar - C.bn_new_wrapper((*C.bn_st)(&jointx)) - C.bn_sum_vector((*C.bn_st)(&jointx), (*C.bn_st)(&qualifiedx[0]), + C.bn_new_wrapper((*C.Fr)(&jointx)) + C.Fr_sum_vector((*C.Fr)(&jointx), (*C.Fr)(&qualifiedx[0]), (C.int)(qualified)) // sum up Y var jointPublicKey pointG2 @@ -338,4 +339,4 @@ func (s *JointFeldmanState) getQualifiedKeys(qualified int) ([]scalar, []pointG2 } } return qualifiedx, qualifiedPubKey, qualifiedy -} +}*/ diff --git a/crypto/dkg_test.go b/crypto/dkg_test.go index d996ae0835c..104cb8ef56f 100644 --- a/crypto/dkg_test.go +++ b/crypto/dkg_test.go @@ -3,6 +3,7 @@ package crypto +/* import ( "fmt" mrand "math/rand" @@ -833,4 +834,4 @@ func TestDKGTransitionErrors(t *testing.T) { assert.True(t, IsDKGInvalidStateTransitionError(err)) } }) -} +}*/ diff --git a/crypto/thresholdsign.go b/crypto/thresholdsign.go index 2dae7061b76..ebb814dee5b 100644 --- a/crypto/thresholdsign.go +++ b/crypto/thresholdsign.go @@ -16,10 +16,10 @@ import ( // the input threshold value (t) should be set to t = floor((n-1)/2). const ( - // ThresholdSignMinSize is the minimum size of a group participating in a threshold signature protocol - ThresholdSignMinSize = MinimumThreshold + 1 - // ThresholdSignMaxSize is the maximum size of a group participating in a threshold signature protocol - ThresholdSignMaxSize = DKGMaxSize +// ThresholdSignMinSize is the minimum size of a group participating in a threshold signature protocol +// ThresholdSignMinSize = MinimumThreshold + 1 +// ThresholdSignMaxSize is the maximum size of a group participating in a threshold signature protocol +// ThresholdSignMaxSize = DKGMaxSize ) // ThresholdSignatureInspector is an inspector of the threshold signature protocol. 
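One note on the transitional bridge introduced in this commit: Fr_blst_to_relic heap-allocates the bn_st it returns, so callers that keep the ep_mult pattern are expected to release it once the Relic call is done. A sketch under the ALLOC=AUTO assumption stated in bls12381_utils.c, where freeing the struct itself is sufficient (the free call is not part of this patch):

    void ep_mult(ep_t res, const ep_t p, const Fr* expo) {
        bn_st* tmp_expo = Fr_blst_to_relic(expo);   // heap-allocated temporary
        ep_mul_lwnaf(res, p, tmp_expo);             // window-NAF multiplication, as before
        free(tmp_expo);                             // release the temporary bignum
    }
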
From b99d75bc530c46b9b90ab523c386d6fab65749e7 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Sat, 18 Feb 2023 20:36:07 -0600 Subject: [PATCH 011/200] use new type Fr in BLS simple sig --- crypto/bls.go | 5 +- crypto/bls12381_utils.c | 86 ++++++++++++----------------------- crypto/bls12381_utils.go | 46 +++++++++++++------ crypto/bls12381_utils.h | 33 +++++++------- crypto/bls12381_utils_test.go | 2 +- crypto/bls_include.h | 2 +- crypto/bls_test.go | 4 +- crypto/blst_include.h | 15 +++++- crypto/blst_src/README.md | 5 +- crypto/blst_tools.c | 50 ++++++++++++++++++++ crypto/dkg_feldmanvss.go | 2 - crypto/dkg_feldmanvssq.go | 2 - crypto/dkg_jointfeldman.go | 1 - 13 files changed, 150 insertions(+), 103 deletions(-) create mode 100644 crypto/blst_tools.c diff --git a/crypto/bls.go b/crypto/bls.go index e4b9d4825b6..8abee2c9200 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -61,7 +61,7 @@ const ( // // SignatureLenBLSBLS12381 is the size of G1 elements SignatureLenBLSBLS12381 = fieldSize * (2 - serializationG1) // the length is divided by 2 if compression is on - PrKeyLenBLSBLS12381 = 32 + PrKeyLenBLSBLS12381 = 32 // equal to frBytesLen // PubKeyLenBLSBLS12381 is the size of G2 elements PubKeyLenBLSBLS12381 = 2 * fieldSize * (2 - serializationG2) // the length is divided by 2 if compression is on @@ -271,7 +271,7 @@ func (a *blsBLS12381Algo) generatePrivateKey(ikm []byte) (PrivateKey, error) { // L is the OKM length // L = ceil((3 * ceil(log2(r))) / 16) which makes L (security_bits/8)-larger than r size - okmLength := (3 * PrKeyLenBLSBLS12381) / 2 + okmLength := (3 * frBytesLen) / 2 // HKDF secret = IKM || I2OSP(0, 1) secret := make([]byte, len(ikm)+1) @@ -320,6 +320,7 @@ func BLSInvalidSignature() Signature { } // decodePrivateKey decodes a slice of bytes into a private key. +// Decoding assumes a bytes big endian format. // It checks the scalar is non-zero and is less than the group order. func (a *blsBLS12381Algo) decodePrivateKey(privateKeyBytes []byte) (PrivateKey, error) { sk := newPrKeyBLSBLS12381(nil) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 28cf52f04a2..d71e583daf0 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -29,8 +29,8 @@ int get_invalid() { return INVALID; } -void bn_new_wrapper(bn_t a) { - bn_new(a); +int get_Fr_BYTES() { + return Fr_BYTES; } // global variable of the pre-computed data @@ -171,7 +171,15 @@ void bytes_print_(char* s, byte* data, int len) { printf("\n"); } -// DEBUG printing functions +void Fr_print_(char* s, Fr* a) { + printf("[%s]:\n", s); + limb_t* p = (limb_t*)(a) + Fr_DIGITS; + for (int i=0; iep_r);*/ -} - -// Reads a scalar from an array and maps it to Zr. -// The resulting scalar `a` satisfies 0 <= a < r. -// `len` must be less than BITS_TO_BYTES(RLC_BN_BITS). -// It returns VALID if scalar is zero and INVALID otherwise -int bn_map_to_Zr(Fr* a, const uint8_t* bin, int len) { - /*bn_t tmp; - bn_new(tmp); - bn_new_size(tmp, BYTES_TO_DIGITS(len)); - bn_read_bin(tmp, bin, len); - bn_mod(a, tmp, &core_get()->ep_r); - bn_rand(tmp, RLC_POS, len << 3); // overwrite tmp - bn_free(tmp); - if (bn_cmp_dig(a, 0) == RLC_EQ) { - return VALID; - } - return INVALID;*/ -} - -// Reads a scalar from an array and maps it to Zr*. -// The resulting scalar `a` satisfies 0 < a < r. 
-// `len` must be less than BITS_TO_BYTES(RLC_BN_BITS) -void bn_map_to_Zr_star(Fr* a, const uint8_t* bin, int len) { - /*bn_t tmp; - bn_new(tmp); - bn_new_size(tmp, BYTES_TO_DIGITS(len)); - bn_read_bin(tmp, bin, len); - bn_t r_1; - bn_new(r_1); - bn_sub_dig(r_1, &core_get()->ep_r, 1); - bn_mod_basic(a,tmp,r_1); - bn_add_dig(a,a,1); - bn_rand(tmp, RLC_POS, len << 3); // overwrite tmp - bn_free(tmp); - bn_free(r_1);*/ +// Reads a scalar from an array and maps it to Fr. +// It returns true if scalar is zero and false otherwise. +bool map_bytes_to_Fr(Fr* a, const uint8_t* bin, int len) { + vec256_from_be_bytes((limb_t*)a, bin, len); + return Fr_is_zero(a); } // returns the sign of y. @@ -561,6 +522,8 @@ bool_t Fr_is_equal(const Fr* a, const Fr* b) { } // reads a scalar in `a` and checks it is a valid Fr element (a < r). +// input bytes are big endian. +// returns: // - BLST_BAD_ENCODING if the length is invalid // - BLST_BAD_SCALAR if the scalar isn't in Fr // - v if the scalar is valid @@ -568,15 +531,19 @@ BLST_ERROR Fr_read_bytes(Fr* a, const uint8_t *bin, int len) { if (len != Fr_BYTES) { return BLST_BAD_ENCODING; } - if (!check_mod_256(bin, BLS12_381_r)) { // check_mod_256 compares byte[] against a vec256! + pow256 tmp; + pow256_from_be_bytes(tmp, bin); + if (!check_mod_256(tmp, BLS12_381_r)) { // check_mod_256 compares pow256 against a vec256! return BLST_BAD_SCALAR; } + vec_zero(tmp, Fr_BYTES); limbs_from_be_bytes((limb_t*)a, bin, Fr_BYTES); return BLST_SUCCESS; } // reads a scalar in `a` and checks it is a valid Fr_star element (0 < a < r). -// returns +// input bytes are big endian. +// returns: // - BLST_BAD_ENCODING if the length is invalid // - BLST_BAD_SCALAR if the scalar isn't in Fr_star // - BLST_SUCCESS if the scalar is valid @@ -592,6 +559,11 @@ BLST_ERROR Fr_star_read_bytes(Fr* a, const uint8_t *bin, int len) { return BLST_SUCCESS; } +// write Fr element `a` in big endian bytes. 
+void Fr_write_bytes(uint8_t *bin, const Fr* a) { + be_bytes_from_limbs(bin, (limb_t*)a, Fr_BYTES); +} + // computes the sum of the array elements x and writes the sum in jointx // the sum is computed in Fr void Fr_sum_vector(Fr* jointx, const Fr* x, const int len) { diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 43e6a782291..d569bf0cc38 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -14,6 +14,7 @@ package crypto // #include "bls12381_utils.h" import "C" import ( + "crypto/rand" "errors" ) @@ -23,6 +24,9 @@ type pointG1 C.ep_st type pointG2 C.ep2_st type scalar C.Fr +// BLS12-381 related lengths +var frBytesLen = int(C.get_Fr_BYTES()) + // TODO: For now scalars are represented as field elements Fr since all scalars // are less than r - check if distinguishing two types in necessary //type pointG1_blst C.G1 @@ -117,33 +121,47 @@ func (p *pointG2) isInfinity() bool { return C.ep2_is_infty((*C.ep2_st)(p)) == 1 } -// returns a random number in Fr -func randZr(x *scalar) { - //C.bn_randZr((*C.Fr)(x)) +// returns a random element of Fr in input pointer +func randZr(x *scalar) error { + bytes := make([]byte, frBytesLen+securityBits/8) + _, err := rand.Read(bytes) // checking one output is enough + if err != nil { + return errors.New("internal rng failed") + } + _ = mapToZr(x, bytes) + return nil } -// returns a random non-zero number in Fr -func randZrStar(x *scalar) { - //C.bn_randZr_star((*C.Fr)(x)) +// writes a random element of Fr* in input pointer +func randZrStar(x *scalar) error { + bytes := make([]byte, frBytesLen+securityBits/8) + isZero := true + for isZero { + _, err := rand.Read(bytes) // checking one output is enough + if err != nil { + return errors.New("internal rng failed") + } + isZero = mapToZr(x, bytes) + } + return nil } // mapToZr reads a scalar from a slice of bytes and maps it to Zr. -// The resulting scalar `k` satisfies 0 <= k < r. +// The resulting element `k` therefore satisfies 0 <= k < r. // It returns true if scalar is zero and false otherwise. 
func mapToZr(x *scalar, src []byte) bool { - isZero := C.bn_map_to_Zr((*C.Fr)(x), + isZero := C.map_bytes_to_Fr((*C.Fr)(x), (*C.uchar)(&src[0]), (C.int)(len(src))) - return isZero == valid + if isZero { + return true + } + return false } // writeScalar writes a G2 point in a slice of bytes func writeScalar(dest []byte, x *scalar) { - /*C.bn_write_bin((*C.uchar)(&dest[0]), - (C.int)(prKeyLengthBLSBLS12381), - (*C.Fr)(x), - )*/ - // TODO: to fill + C.Fr_write_bytes((*C.uchar)(&dest[0]), (*C.Fr)(x)) } // writePointG2 writes a G2 point in a slice of bytes diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 7b5ec0508f0..8d5a8ed0a6e 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -81,6 +81,10 @@ typedef struct prec_ { // TODO: to delete when Relic is removed bn_st* Fr_blst_to_relic(const Fr* x); +int get_valid(); +int get_invalid(); +int get_Fr_BYTES(); + // BLS based SPoCK int bls_spock_verify(const ep2_t, const byte*, const ep2_t, const byte*); @@ -88,25 +92,24 @@ int bls_spock_verify(const ep2_t, const byte*, const ep2_t, const byte*); void map_to_G1(ep_t, const byte*, const int); // Fr utilities -bool_t Fr_is_zero(const Fr* a); -bool_t Fr_is_equal(const Fr* a, const Fr* b); +bool_t Fr_is_zero(const Fr* a); +bool_t Fr_is_equal(const Fr* a, const Fr* b); +BLST_ERROR Fr_read_bytes(Fr* a, const uint8_t *bin, int len); +BLST_ERROR Fr_star_read_bytes(Fr* a, const uint8_t *bin, int len); +void Fr_write_bytes(uint8_t *bin, const Fr* a); +bool map_bytes_to_Fr(Fr*, const uint8_t*, int); // Utility functions -int get_valid(); -int get_invalid(); -void bn_new_wrapper(bn_t a); - ctx_t* relic_init_BLS12_381(); prec_st* init_precomputed_data_BLS12_381(); void precomputed_data_set(const prec_st* p); void seed_relic(byte*, int); -int ep_read_bin_compact(ep_t, const byte *, const int); -void ep_write_bin_compact(byte *, const ep_t, const int); -int ep2_read_bin_compact(ep2_t, const byte *, const int); -void ep2_write_bin_compact(byte *, const ep2_t, const int); -BLST_ERROR Fr_read_bytes(Fr* a, const uint8_t *bin, int len); -BLST_ERROR Fr_star_read_bytes(Fr* a, const uint8_t *bin, int len); +int ep_read_bin_compact(ep_t, const byte *, const int); +void ep_write_bin_compact(byte *, const ep_t, const int); +int ep2_read_bin_compact(ep2_t, const byte *, const int); +void ep2_write_bin_compact(byte *, const ep2_t, const int); + @@ -116,11 +119,6 @@ void ep_mult(ep_t, const ep_t, const Fr*); void ep2_mult_gen(ep2_t, const Fr*); void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo); -void bn_randZr(Fr*); -void bn_randZr_star(Fr*); -int bn_map_to_Zr(Fr*, const uint8_t*, int); -void bn_map_to_Zr_star(Fr*, const uint8_t*, int); - void Fr_sum_vector(Fr*, const Fr*, const int); void ep_sum_vector(ep_t, ep_st*, const int); void ep2_sum_vector(ep2_t, ep2_st*, const int); @@ -147,6 +145,7 @@ void xmd_sha256(uint8_t *, int, uint8_t *, int, uint8_t *, int); // Debugging related functions void bytes_print_(char*, byte*, int); +void Fr_print_(char*, Fr*); void fp_print_(char*, fp_t); void bn_print_(char*, bn_st*); void ep_print_(char*, ep_st*); diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index ac5fc6ecc93..877eff219e3 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -54,7 +54,7 @@ func BenchmarkScalarMultG1G2(b *testing.B) { _, _ = rand.Read(seed) _ = seedRelic(seed) var expo scalar - randZr(&expo) // TODO: upadate + randZr(&expo) // G1 generator multiplication b.Run("G1 gen", func(b *testing.B) { diff --git 
a/crypto/bls_include.h b/crypto/bls_include.h index 325203479b2..0e965bac88e 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -8,7 +8,7 @@ #include "relic.h" #include "bls12381_utils.h" -// Signature, Public key and Private key lengths +// Signature, Public key and Private key lengths #define FULL_SIGNATURE_LEN G1_BYTES #define FULL_PK_LEN G2_BYTES #define SIGNATURE_LEN (FULL_SIGNATURE_LEN/(G1_SERIALIZATION+1)) diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 579700a183e..5e4a13564bd 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -151,7 +151,7 @@ func TestBLSEncodeDecode(t *testing.T) { // specific tests for BLS // zero private key - skBytes := make([]byte, PrKeyLenBLSBLS12381) + /*skBytes := make([]byte, PrKeyLenBLSBLS12381) sk, err := DecodePrivateKey(BLSBLS12381, skBytes) require.Error(t, err, "decoding identity private key should fail") assert.True(t, IsInvalidInputsError(err)) @@ -195,7 +195,7 @@ func TestBLSEncodeDecode(t *testing.T) { invalidPk2, err := hex.DecodeString("818d72183e3e908af5bd6c2e37494c749b88f0396d3fbc2ba4d9ea28f1c50d1c6a540ec8fe06b6d860f72ec9363db3b81D84726AD080BA07C1385A1CF2B758C104E127F8585862EDEB843E798A86E6C2E1894F067C35F8A132FEACC450F9644D") require.NoError(t, err) _, err = DecodePublicKey(BLSBLS12381, invalidPk2) - assert.Error(t, err) + assert.Error(t, err)*/ } // TestBLSEquals tests equal for BLS keys diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 2721edcd97a..0733bda0b30 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -3,18 +3,29 @@ #ifndef __BLST_INCLUDE_H__ #define __BLST_INCLUDE_H__ -// blst related definitions +// extra tools to use BLST low level that are needed by the Flow crypto library // eventually this file would replace blst.h #include "blst.h" // TODO: should be deleted #include "point.h" #include "consts.h" +// types used by the Flow crypto library that are imported from BLST +// these type definitions are used as an abstraction from BLST internal types + // field elements F_r -typedef struct {limb_t limbs[4];} Fr; // also used as vec256; +typedef struct {limb_t limbs[4];} Fr; // also used as vec256 (little endian limbs) // Subroup G1 in E1 typedef POINTonE1 G1; // Subroup G1 in E2 typedef POINTonE2 G2; + +// extra functions and tools that are needed by the Flow crypto library +// and that are not exported in the desired form by BLST + +void pow256_from_be_bytes(pow256 ret, const unsigned char a[32]); +void vec256_from_be_bytes(vec256 out, const unsigned char *bytes, size_t n); + + #endif \ No newline at end of file diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md index 12bc7b863ca..877c9db7ee5 100644 --- a/crypto/blst_src/README.md +++ b/crypto/blst_src/README.md @@ -7,10 +7,11 @@ specifically from the commit <92c12ac58095de04e776cec5ef5ce5bdf242b693>. While BLST exports multiple functions and tools, the implementation in Flow crypto requires access to low level functions. Some of these tools are not exported by BLST, others would need to be used without paying for the cgo cost, and therefore without using the Go bindings in BLST. 
- The folder contains: - BLST LICENSE file - all /src/*.c and /src/*.h files (C source files) - all /build (assembly generated files) - /bindings/blst.h (headers of external functions) -- /bindings/blst_aux.h (headers of external aux functions) \ No newline at end of file +- /bindings/blst_aux.h (headers of external aux functions) + +TODO: add steps for upgrading the BLST version \ No newline at end of file diff --git a/crypto/blst_tools.c b/crypto/blst_tools.c new file mode 100644 index 00000000000..dcc1b1171a4 --- /dev/null +++ b/crypto/blst_tools.c @@ -0,0 +1,50 @@ +// +build relic + +// extra tools to use BLST low level that are needed by the Flow crypto library + +#include "blst_include.h" +#include "bls12381_utils.h" + +// internal type of BLST `pow256` uses bytes little endian. +// input is bytes big endian as used by Flow crypto lib external scalars. +void pow256_from_be_bytes(pow256 ret, const unsigned char a[Fr_BYTES]) +{ + unsigned char* b = (unsigned char*)a + Fr_BYTES - 1; + if ((uptr_t)ret == (uptr_t)a) { // swap in place + for (int i=0; i 32) { + limbs_from_be_bytes(digit, bytes -= 32, 32); + from_mont_256(digit, digit, BLS12_381_r, r0); + mul_mont_sparse_256(digit, digit, radix, BLS12_381_r, r0); + add_mod_256(out, out, digit, BLS12_381_r); + mul_mont_sparse_256(radix, radix, BLS12_381_rRR, BLS12_381_r, r0); + n -= 32; + } + limbs_from_be_bytes(digit, bytes -= n, n); + from_mont_256(digit, digit, BLS12_381_r, r0); + mul_mont_sparse_256(digit, digit, radix, BLS12_381_r, r0); + add_mod_256(out, out, digit, BLS12_381_r); + + vec_zero(digit, sizeof(digit)); +} \ No newline at end of file diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index 76a5aebcd49..221253168cd 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -83,7 +83,6 @@ func (s *feldmanVSSstate) init() { s.y = nil s.xReceived = false s.vAReceived = false - C.bn_new_wrapper((*C.Fr)(&s.x)) } // Start triggers the protocol start for the current participant. 
@@ -274,7 +273,6 @@ func (s *feldmanVSSstate) generateShares(seed []byte) error { generatorScalarMultG2(&s.vA[0], &s.a[0]) if s.threshold > 0 { for i := 1; i < s.threshold; i++ { - C.bn_new_wrapper((*C.Fr)(&s.a[i])) randZr(&s.a[i]) generatorScalarMultG2(&s.vA[i], &s.a[i]) } diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index 76f343256a4..8a92cd5dff3 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -625,7 +625,6 @@ func (s *feldmanVSSQualState) receiveComplaintAnswer(origin index, data []byte) } // read the complainer private share - C.bn_new_wrapper((*C.Fr)(&s.complaints[complainer].answer)) if C.Fr_read_bytes((*C.Fr)(&s.complaints[complainer].answer), (*C.uchar)(&data[1]), PrKeyLenBLSBLS12381, @@ -649,7 +648,6 @@ func (s *feldmanVSSQualState) receiveComplaintAnswer(origin index, data []byte) // flag check is a sanity check if c.received { // read the complainer private share - C.bn_new_wrapper((*C.Fr)(&c.answer)) if C.Fr_read_bytes((*C.Fr)(&c.answer), (*C.uchar)(&data[1]), PrKeyLenBLSBLS12381, diff --git a/crypto/dkg_jointfeldman.go b/crypto/dkg_jointfeldman.go index 51733e803fb..be8b2c9f70f 100644 --- a/crypto/dkg_jointfeldman.go +++ b/crypto/dkg_jointfeldman.go @@ -304,7 +304,6 @@ func (s *JointFeldmanState) sumUpQualifiedKeys(qualified int) (*scalar, *pointG2 // sum up x var jointx scalar - C.bn_new_wrapper((*C.Fr)(&jointx)) C.Fr_sum_vector((*C.Fr)(&jointx), (*C.Fr)(&qualifiedx[0]), (C.int)(qualified)) // sum up Y From e3f4fee85672701dfa6dc84d253cb7ecdf6d974a Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Sat, 25 Feb 2023 16:31:42 -0600 Subject: [PATCH 012/200] implement BLS threshold signature with new Fr type --- crypto/bls.go | 2 +- crypto/bls12381_hashtocurve.c | 8 + crypto/bls12381_utils.c | 247 +++++++++++++++++++---------- crypto/bls12381_utils.h | 14 +- crypto/bls12381_utils_test.go | 51 +----- crypto/bls_core.c | 22 +-- crypto/bls_test.go | 4 +- crypto/bls_thresholdsign.go | 20 +-- crypto/bls_thresholdsign_core.c | 222 +++++++++++++++----------- crypto/bls_thresholdsign_include.h | 8 +- crypto/bls_thresholdsign_test.go | 20 +-- crypto/blst_include.h | 8 +- crypto/blst_tools.c | 26 --- crypto/dkg.go | 3 +- crypto/dkg_core.c | 56 +++---- crypto/dkg_feldmanvssq.go | 3 +- crypto/dkg_include.h | 6 +- crypto/dkg_test.go | 17 +- crypto/thresholdsign.go | 8 +- 19 files changed, 393 insertions(+), 352 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 8abee2c9200..48996e0ae9d 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -222,7 +222,7 @@ func (pk *pubKeyBLSBLS12381) Verify(s Signature, data []byte, kmac hash.Hasher) case valid: return true, nil default: - return false, fmt.Errorf("signature verification failed") + return false, fmt.Errorf("signature verification failed: code %d", verif) } } diff --git a/crypto/bls12381_hashtocurve.c b/crypto/bls12381_hashtocurve.c index 229f9c009de..62053d7ed22 100644 --- a/crypto/bls12381_hashtocurve.c +++ b/crypto/bls12381_hashtocurve.c @@ -335,4 +335,12 @@ void map_to_G1(ep_t h, const byte* data, const int len) { #elif hashToPoint==RELIC_SSWU ep_map_from_field(h, data, len); #endif + + /*Fr a, b; + Fr_set_limb(&a, 1); + Fr_print_("a", &a); + Fr_inv_montg_eucl(&b,&a); + Fr_print_("b", &b); + Fr_from_montg(&b, &b); + Fr_print_("b", &b); */ } diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index d71e583daf0..c8dfb808827 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -8,6 +8,23 @@ #include "bls_include.h" #include "assert.h" +// The functions are 
tested for ALLOC=AUTO (not for ALLOC=DYNAMIC) + +// return macro values to the upper Go Layer +int get_valid() { + return VALID; +} + +int get_invalid() { + return INVALID; +} + +int get_Fr_BYTES() { + return Fr_BYTES; +} + +// Fr utilities + // TODO: temp utility function to delete bn_st* Fr_blst_to_relic(const Fr* x) { bn_st* out = (bn_st*)malloc(sizeof(bn_st)); @@ -18,19 +35,158 @@ bn_st* Fr_blst_to_relic(const Fr* x) { return out; } -// The functions are tested for ALLOC=AUTO (not for ALLOC=DYNAMIC) +// returns true if a == 0 and false otherwise +bool_t Fr_is_zero(const Fr* a) { + return bytes_are_zero((const byte*)a, Fr_BYTES); +} -// return macro values to the upper Go Layer -int get_valid() { - return VALID; +// returns true if a == b and false otherwise +bool_t Fr_is_equal(const Fr* a, const Fr* b) { + return vec_is_equal(a, b, Fr_BYTES); } -int get_invalid() { - return INVALID; +// sets `a` to limb `l` +void Fr_set_limb(Fr* a, const limb_t l){ + vec_zero((byte*)a + sizeof(limb_t), Fr_BYTES - sizeof(limb_t)); + *((limb_t*)a) = l; } -int get_Fr_BYTES() { - return Fr_BYTES; +void Fr_copy(Fr* res, Fr* a) { + vec_copy((byte*)res, (byte*)a, Fr_BYTES); +} + +// sets `a` to 0 +void Fr_set_zero(Fr* a){ + vec_zero((byte*)a, Fr_BYTES); +} + +void Fr_add(Fr *res, const Fr *a, const Fr *b) { + add_mod_256((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_r); +} + +void Fr_sub(Fr *res, const Fr *a, const Fr *b) { + sub_mod_256((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_r); +} + +void Fr_neg(Fr *res, const Fr *a) { + cneg_mod_256((limb_t*)res, (limb_t*)a, 1, BLS12_381_r); +} + +void Fr_mul_montg(Fr *res, const Fr *a, const Fr *b) { + mul_mont_sparse_256((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_r, r0); +} + +void Fr_to_montg(Fr *res, const Fr *a) { + mul_mont_sparse_256((limb_t*)res, (limb_t*)a, BLS12_381_rRR, BLS12_381_r, r0); +} + +void Fr_from_montg(Fr *res, const Fr *a) { + from_mont_256((limb_t*)res, (limb_t*)a, BLS12_381_r, r0); +} + +// result is in Montgomery form +// res = a^(-1)*R +void Fr_inv_montg_eucl(Fr *res, const Fr *a) { + // copied and modified from BLST code + // Copyright Supranational LLC + static const vec256 rx2 = { /* left-aligned value of the modulus */ + TO_LIMB_T(0xfffffffe00000002), TO_LIMB_T(0xa77b4805fffcb7fd), + TO_LIMB_T(0x6673b0101343b00a), TO_LIMB_T(0xe7db4ea6533afa90), + }; + vec512 temp; + ct_inverse_mod_256(temp, (limb_t*)a, BLS12_381_r, rx2); + redc_mont_256((limb_t*)res, temp, BLS12_381_r, r0); +} + +void Fr_inv_montg_expo(Fr *res, const Fr *a) { + // TODO: +} + +// computes the sum of the array elements and writes the sum in jointx +void Fr_sum_vector(Fr* jointx, const Fr x[], const int len) { + Fr_set_zero(jointx); + for (int i=0; i Fr_BYTES) { + limbs_from_be_bytes((limb_t*)&digit, bytes -= Fr_BYTES, Fr_BYTES); // l_i + Fr_mul_montg(&digit, &digit, &radix); // l_i * R^i (i is the loop number starting at 1) + Fr_add(out, out, &digit); + Fr_mul_montg(&radix, &radix, (Fr*)BLS12_381_rRR); // R^(i+1) + n -= Fr_BYTES; + } + Fr_set_zero(&digit); + limbs_from_be_bytes((limb_t*)&digit, bytes -= n, n); + Fr_mul_montg(&digit, &digit, &radix); + Fr_add(out, out, &digit); + // at this point : out = l_1*R + L_2*R^2 .. + L_n*R^n + // reduce the extra R + Fr_from_montg(out, out); + // clean up possible sensitive data + Fr_set_zero(&digit); +} + +// Reads a scalar from an array and maps it to Fr. +// It returns true if scalar is zero and false otherwise. 
+bool map_bytes_to_Fr(Fr* a, const uint8_t* bin, int len) { + vec256_from_be_bytes(a, bin, len); + //Fr_set_limb(a, 1); TODO: delete + return Fr_is_zero(a); } // global variable of the pre-computed data @@ -160,7 +316,7 @@ void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo) { void ep2_mult_gen(ep2_t res, const Fr* expo) { bn_st* tmp_expo = Fr_blst_to_relic(expo); // Using precomputed table of size 4 - g2_mul_gen(res, (bn_st*)tmp_expo); + g2_mul_gen(res, tmp_expo); } // DEBUG printing functions @@ -204,13 +360,6 @@ void ep2_print_(char* s, ep2_st* p) { g2_print(p); } -// Reads a scalar from an array and maps it to Fr. -// It returns true if scalar is zero and false otherwise. -bool map_bytes_to_Fr(Fr* a, const uint8_t* bin, int len) { - vec256_from_be_bytes((limb_t*)a, bin, len); - return Fr_is_zero(a); -} - // returns the sign of y. // 1 if y > (p - 1)/2 and 0 otherwise. static int fp_get_sign(const fp_t y) { @@ -513,72 +662,6 @@ int ep2_read_bin_compact(ep2_t a, const byte *bin, const int len) { return RLC_ERR; } -bool_t Fr_is_zero(const Fr* a) { - return bytes_are_zero((const byte*)a, Fr_BYTES); -} - -bool_t Fr_is_equal(const Fr* a, const Fr* b) { - return vec_is_equal(a, b, Fr_BYTES); -} - -// reads a scalar in `a` and checks it is a valid Fr element (a < r). -// input bytes are big endian. -// returns: -// - BLST_BAD_ENCODING if the length is invalid -// - BLST_BAD_SCALAR if the scalar isn't in Fr -// - v if the scalar is valid -BLST_ERROR Fr_read_bytes(Fr* a, const uint8_t *bin, int len) { - if (len != Fr_BYTES) { - return BLST_BAD_ENCODING; - } - pow256 tmp; - pow256_from_be_bytes(tmp, bin); - if (!check_mod_256(tmp, BLS12_381_r)) { // check_mod_256 compares pow256 against a vec256! - return BLST_BAD_SCALAR; - } - vec_zero(tmp, Fr_BYTES); - limbs_from_be_bytes((limb_t*)a, bin, Fr_BYTES); - return BLST_SUCCESS; -} - -// reads a scalar in `a` and checks it is a valid Fr_star element (0 < a < r). -// input bytes are big endian. -// returns: -// - BLST_BAD_ENCODING if the length is invalid -// - BLST_BAD_SCALAR if the scalar isn't in Fr_star -// - BLST_SUCCESS if the scalar is valid -BLST_ERROR Fr_star_read_bytes(Fr* a, const uint8_t *bin, int len) { - int ret = Fr_read_bytes(a, bin, len); - if (ret != BLST_SUCCESS) { - return ret; - } - // check if a=0 - if (Fr_is_zero(a)) { - return BLST_BAD_SCALAR; - } - return BLST_SUCCESS; -} - -// write Fr element `a` in big endian bytes. -void Fr_write_bytes(uint8_t *bin, const Fr* a) { - be_bytes_from_limbs(bin, (limb_t*)a, Fr_BYTES); -} - -// computes the sum of the array elements x and writes the sum in jointx -// the sum is computed in Fr -void Fr_sum_vector(Fr* jointx, const Fr* x, const int len) { - /*bn_t r; - bn_new(r); - g2_get_ord(r); - bn_set_dig(jointx, 0); - bn_new_size(jointx, BITS_TO_DIGITS(Fr_BITS+1)); - for (int i=0; iep_r) != RLC_LT || bn_cmp_dig(a, 0) != RLC_GT) { - return INVALID; - } - return VALID; -} - // Checks if input point p is in the subgroup G1. // The function assumes the input is known to be on the curve E1. int check_membership_G1(const ep_t p){ @@ -93,8 +84,7 @@ void bls_sign(byte* s, const Fr* sk, const byte* data, const int len) { // and a message data. // The signature and public key are assumed to be in G1 and G2 respectively. This // function only checks the pairing equality. 
-static int bls_verify_ep(const ep2_t pk, const ep_t s, const byte* data, const int len) { - +static int bls_verify_ep(const ep2_t pk, const ep_t s, const byte* data, const int len) { ep_t elemsG1[2]; ep2_t elemsG2[2]; @@ -109,7 +99,7 @@ static int bls_verify_ep(const ep2_t pk, const ep_t s, const byte* data, const i // elemsG2[1] = pk ep2_new(elemsG2[1]); - ep2_copy(elemsG2[1], (ep2_st*)pk); + ep2_copy(elemsG2[1], (ep2_st*)pk); #if DOUBLE_PAIRING // elemsG2[0] = -g2 @@ -118,12 +108,14 @@ static int bls_verify_ep(const ep2_t pk, const ep_t s, const byte* data, const i fp12_t pair; fp12_new(&pair); + if (core_get()->code != RLC_OK) printf("EUUUUUUUU\n"); // double pairing with Optimal Ate pp_map_sim_oatep_k12(pair, (ep_t*)(elemsG1) , (ep2_t*)(elemsG2), 2); // compare the result to 1 int res = fp12_cmp_dig(pair, 1); + #elif SINGLE_PAIRING fp12_t pair1, pair2; fp12_new(&pair1); fp12_new(&pair2); @@ -342,12 +334,14 @@ int bls_verify(const ep2_t pk, const byte* sig, const byte* data, const int len) // deserialize the signature into a curve point int read_ret = ep_read_bin_compact(s, sig, SIGNATURE_LEN); - if (read_ret != RLC_OK) + if (read_ret != RLC_OK) { return read_ret; + } // check s is in G1 - if (check_membership_G1(s) != VALID) // only enabled if MEMBERSHIP_CHECK==1 + if (check_membership_G1(s) != VALID) { // only enabled if MEMBERSHIP_CHECK==1 return INVALID; + } return bls_verify_ep(pk, s, data, len); } diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 5e4a13564bd..579700a183e 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -151,7 +151,7 @@ func TestBLSEncodeDecode(t *testing.T) { // specific tests for BLS // zero private key - /*skBytes := make([]byte, PrKeyLenBLSBLS12381) + skBytes := make([]byte, PrKeyLenBLSBLS12381) sk, err := DecodePrivateKey(BLSBLS12381, skBytes) require.Error(t, err, "decoding identity private key should fail") assert.True(t, IsInvalidInputsError(err)) @@ -195,7 +195,7 @@ func TestBLSEncodeDecode(t *testing.T) { invalidPk2, err := hex.DecodeString("818d72183e3e908af5bd6c2e37494c749b88f0396d3fbc2ba4d9ea28f1c50d1c6a540ec8fe06b6d860f72ec9363db3b81D84726AD080BA07C1385A1CF2B758C104E127F8585862EDEB843E798A86E6C2E1894F067C35F8A132FEACC450F9644D") require.NoError(t, err) _, err = DecodePublicKey(BLSBLS12381, invalidPk2) - assert.Error(t, err)*/ + assert.Error(t, err) } // TestBLSEquals tests equal for BLS keys diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index df3da1a108d..ef4630a7341 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -5,7 +5,7 @@ package crypto // #cgo CFLAGS: // #include "bls_thresholdsign_include.h" -/*import "C" +import "C" import ( "fmt" @@ -409,14 +409,11 @@ func (s *blsThresholdSignatureInspector) reconstructThresholdSignature() (Signat signers := make([]index, 0, len(s.shares)) for index, share := range s.shares { shares = append(shares, share...) 
- signers = append(signers, index) + signers = append(signers, index+1) } - - - // Lagrange Interpolate at point 0 - result := C.G1_lagrangeInterpolateAtZero( + result := C.G1_lagrangeInterpolateAtZero_serialized( (*C.uchar)(&thresholdSignature[0]), (*C.uchar)(&shares[0]), (*C.uint8_t)(&signers[0]), (C.int)(s.threshold+1)) @@ -457,8 +454,6 @@ func (s *blsThresholdSignatureInspector) reconstructThresholdSignature() (Signat func BLSReconstructThresholdSignature(size int, threshold int, shares []Signature, signers []int) (Signature, error) { - - if size < ThresholdSignMinSize || size > ThresholdSignMaxSize { return nil, invalidInputsErrorf( "size should be between %d and %d", @@ -501,12 +496,12 @@ func BLSReconstructThresholdSignature(size int, threshold int, "%d is a duplicate signer", index(signers[i])) } m[index(signers[i])] = true - indexSigners = append(indexSigners, index(signers[i])) + indexSigners = append(indexSigners, index(signers[i])+1) } thresholdSignature := make([]byte, signatureLengthBLSBLS12381) // Lagrange Interpolate at point 0 - if C.G1_lagrangeInterpolateAtZero( + if C.G1_lagrangeInterpolateAtZero_serialized( (*C.uchar)(&thresholdSignature[0]), (*C.uchar)(&flatShares[0]), (*C.uint8_t)(&indexSigners[0]), (C.int)(threshold+1), @@ -558,9 +553,6 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, threshold) } - - - // the scalars x and G2 points y x := make([]scalar, size) y := make([]pointG2, size) @@ -604,4 +596,4 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, // are sampled uniformly at random. The probability of // generating an identity key is therefore negligible. return skShares, pkShares, pkGroup, nil -}*/ +} diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index 94a12a024d7..68f5005ace4 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -2,122 +2,154 @@ #include "bls_thresholdsign_include.h" -// Computes the Lagrange coefficient L(i+1) at 0 with regards to the range [signers(0)+1..signers(t)+1] -// and stores it in res, where t is the degree of the polynomial P -static void Zr_lagrangeCoefficientAtZero(bn_t res, const int i, const uint8_t* signers, const int len){ - // r is the order of G1 and G2 - bn_t r, r_2; - bn_new(r); - g2_get_ord(r); - // (r-2) is needed to compute the inverse in Fr - // using little Fermat theorem - bn_new(r_2); - bn_sub_dig(r_2, r, 2); - //#define MOD_METHOD MONTY - #define MOD_METHOD BASIC - - #if MOD_METHOD == MONTY - bn_t u; - bn_new(u) - // Montgomery reduction constant - // TODO: hardcode u - bn_mod_pre_monty(u, r); - #endif +// the highest index of a threshold participant +#define MAX_IND 255 +#define MAX_IND_BITS 8 // equal to ceiling(log_2(MAX_IND)) + +// Computes the Lagrange coefficient L_i(0) in Fr with regards to the range [indices(0)..indices(t)] +// and stores it in `res`, where t is the degree of the polynomial P. +// `len` is equal to `t+1` where `t` is the polynomial degree. 
+static void Fr_lagrangeCoefficientAtZero(Fr* res, const int i, const uint8_t indices[], const int len){ + + // coefficient is computed as N * D^(-1) + Fr numerator; // eventually would represent N*R^k + Fr denominator; // eventually would represent D*R^k + + // Initialize N and D to Montgomery constant R + // TODO: hardcode R and add Fr_copy function + Fr_copy(&numerator, (Fr*)BLS12_381_rRR); + Fr_copy(&denominator, (Fr*)BLS12_381_rRR); + Fr_from_montg(&numerator, &numerator); + Fr_from_montg(&denominator, &denominator); + + // sign of D: 1 for positive and 0 for negative + int sign = 1; + + // the highest k such that fact(MAX_IND)/fact(MAX_IND-k) < 2^64 (approximately 64/MAX_IND_BITS) + // this means we can multiply up to (k) indices in a limb (64 bits) without overflowing. + #define MAX_IND_LOOPS 64/MAX_IND_BITS + + // choose inversion algorithm used for denominator + #define FERMAT_INVERSION 0 + #define EUCLIDEAN_INVERSION (FERMAT_INVERSION^1) - // temp buffers - bn_t acc, inv, base, numerator; - bn_new(inv); - bn_new(base); - bn_new_size(base, BITS_TO_DIGITS(Fr_BITS)) - bn_new(acc); - bn_new(numerator); - bn_new_size(acc, BITS_TO_DIGITS(3*Fr_BITS)); - - // the accumulator of the largarnge coeffiecient - // the sign (sign of acc) is equal to 1 if acc is positive, 0 otherwise - bn_set_dig(acc, 1); - int sign = 1; - - // loops is the maximum number of loops that takes the accumulator to - // overflow modulo r, mainly the highest k such that fact(MAX_IND)/fact(MAX_IND-k) < r const int loops = MAX_IND_LOOPS; int k,j = 0; + Fr tmp; while (j 32) { - limbs_from_be_bytes(digit, bytes -= 32, 32); - from_mont_256(digit, digit, BLS12_381_r, r0); - mul_mont_sparse_256(digit, digit, radix, BLS12_381_r, r0); - add_mod_256(out, out, digit, BLS12_381_r); - mul_mont_sparse_256(radix, radix, BLS12_381_rRR, BLS12_381_r, r0); - n -= 32; - } - limbs_from_be_bytes(digit, bytes -= n, n); - from_mont_256(digit, digit, BLS12_381_r, r0); - mul_mont_sparse_256(digit, digit, radix, BLS12_381_r, r0); - add_mod_256(out, out, digit, BLS12_381_r); - - vec_zero(digit, sizeof(digit)); } \ No newline at end of file diff --git a/crypto/dkg.go b/crypto/dkg.go index 3e369b77fa4..1cdf87a128e 100644 --- a/crypto/dkg.go +++ b/crypto/dkg.go @@ -1,6 +1,5 @@ package crypto -/* import ( "errors" "fmt" @@ -235,4 +234,4 @@ type DKGProcessor interface { // do so, the protocol can be broken. // log describes the misbehavior. FlagMisbehavior(participant int, log string) -}*/ +} diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 50923ee9087..fa4729c84e2 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -6,51 +6,37 @@ #define N_max 250 #define N_bits_max 8 // log(250) #define T_max ((N_max-1)/2) - +/* // computes P(x) = a_0 + a_1*x + .. + a_n x^n (mod r) // r being the order of G1 // writes P(x) in out and P(x).g2 in y if y is non NULL // x being a small integer void Fr_polynomialImage_export(byte* out, ep2_t y, const bn_st* a, const int a_size, const byte x){ - bn_t image; - bn_new(image); - Fr_polynomialImage(image, y, a, a_size, x); + Fr image; + Fr_polynomialImage(&image, y, a, a_size, x); // exports the result - const int out_size = Fr_BYTES; - bn_write_bin(out, out_size, image); - bn_free(image); -} + Fr_write_bytes(out, &image); +}*/ -// computes P(x) = a_0 + a_1*x + .. 
+ a_n x^n (mod r) -// r being the order of G1 -// writes P(x) in out and P(x).g2 in y if y is non NULL -// x being a small integer -void Fr_polynomialImage(bn_t image, ep2_t y, const bn_st *a, const int a_size, const byte x){ - bn_t r; - bn_new(r); - g2_get_ord(r); - - // temp variables - bn_t acc; - bn_new(acc); - bn_new_size(acc, BITS_TO_DIGITS(Fr_BITS+8+1)); - bn_set_dig(acc, 0); +// computes P(x) = a_0 + a_1 * x + .. + a_n * x^n where P is in Fr[X]. +// a_i are all in Fr, `a_size` - 1 is P's degree, x is a small integer less than 255. +// The function writes P(x) in `image` and P(x).g2 in `y` if `y` is non NULL +void Fr_polynomialImage(Fr* image, ep2_t y, const Fr* a, const int a_size, const byte x){ + Fr_set_zero(image); + // convert `x` to Montgomery form + Fr xR; + Fr_set_limb(&xR, (limb_t)x); + Fr_to_montg(&xR, &xR); - for (int i=a_size-1; i >= 0; i--) { - bn_mul_dig(acc, acc, x); - // Use basic reduction as it's an 9-bits reduction - // in the worst case (|acc|<|r|+9 ) - bn_mod_basic(acc, acc, r); - bn_add(acc, acc, &a[i]); + for (int i = a_size-1; i >= 0; i--) { + Fr_mul_montg(image, image, &xR); + Fr_add(image, image, &a[i]); // image is in normal form } - // export the result - bn_mod_basic(image, acc, r); - // compute y = P(x).g2 - if (y) g2_mul_gen(y, acc); - - bn_free(acc) - bn_free(r); + if (y) { + bn_st* tmp = Fr_blst_to_relic(image); + g2_mul_gen(y, tmp); + } } // computes Q(x) = A_0 + A_1*x + ... + A_n*x^n in G2 diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index 8a92cd5dff3..2e7688b11fa 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -203,13 +203,14 @@ func (s *feldmanVSSQualState) End() (PrivateKey, PublicKey, []PublicKey, error) return nil, nil, nil, dkgFailureErrorf("group private key is identity and is therefore invalid") } return x, Y, y, nil -} +}*/ const ( complaintSize = 1 complaintAnswerSize = 1 + PrKeyLenBLSBLS12381 ) +/* // HandleBroadcastMsg processes a new broadcasted message received by the current participant. 
// orig is the message origin index // diff --git a/crypto/dkg_include.h b/crypto/dkg_include.h index f50b143961d..34c81053fa7 100644 --- a/crypto/dkg_include.h +++ b/crypto/dkg_include.h @@ -5,12 +5,8 @@ #include "bls12381_utils.h" -// the highest index of a DKG participant -#define MAX_IND 255 -#define MAX_IND_BITS 8 - void Fr_polynomialImage_export(byte* out, ep2_t y, const bn_st* a, const int a_size, const byte x); -void Fr_polynomialImage(bn_t out, ep2_t y, const bn_st* a, const int a_size, const byte x); +void Fr_polynomialImage(Fr* out, ep2_t y, const Fr* a, const int a_size, const byte x); void G2_polynomialImages(ep2_st* y, const int len_y, const ep2_st* A, const int len_A); void ep2_vector_write_bin(byte* out, const ep2_st* A, const int len); int ep2_vector_read_bin(ep2_st* A, const byte* src, const int len); diff --git a/crypto/dkg_test.go b/crypto/dkg_test.go index 104cb8ef56f..fc1de49d225 100644 --- a/crypto/dkg_test.go +++ b/crypto/dkg_test.go @@ -3,26 +3,26 @@ package crypto -/* import ( "fmt" mrand "math/rand" "sync" "testing" - "time" + _ "time" log "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" + _ "github.com/stretchr/testify/require" ) var gt *testing.T +/* func TestDKG(t *testing.T) { t.Run("FeldmanVSSSimple", testFeldmanVSSSimple) t.Run("FeldmanVSSQual", testFeldmanVSSQual) t.Run("JointFeldman", testJointFeldman) -} +}*/ // optimal threshold (t) to allow the largest number of malicious participants (m) // assuming the protocol requires: @@ -33,6 +33,7 @@ func optimalThreshold(size int) int { return (size - 1) / 2 } +/* // Testing the happy path of Feldman VSS by simulating a network of n participants func testFeldmanVSSSimple(t *testing.T) { log.SetLevel(log.ErrorLevel) @@ -43,7 +44,7 @@ func testFeldmanVSSSimple(t *testing.T) { dkgCommonTest(t, feldmanVSS, n, threshold, happyPath) }) } -} +}*/ type testCase int @@ -68,7 +69,7 @@ const ( invalidSharesComplainTrigger invalidComplaintAnswerBroadcast duplicatedSendAndBroadcast -) +) /* // Testing Feldman VSS with the qualification system by simulating a network of n participants func testFeldmanVSSQual(t *testing.T) { @@ -441,7 +442,7 @@ func timeoutPostProcess(processors []testDKGProcessor, t *testing.T, phase int) }(i) } } -} +}*/ // implements DKGProcessor interface type testDKGProcessor struct { @@ -767,7 +768,7 @@ func TestDKGErrorTypes(t *testing.T) { assert.False(t, IsDKGInvalidStateTransitionError(otherError)) assert.False(t, IsDKGInvalidStateTransitionError(nil)) }) -} +} /* func TestDKGTransitionErrors(t *testing.T) { n := 5 diff --git a/crypto/thresholdsign.go b/crypto/thresholdsign.go index ebb814dee5b..2dae7061b76 100644 --- a/crypto/thresholdsign.go +++ b/crypto/thresholdsign.go @@ -16,10 +16,10 @@ import ( // the input threshold value (t) should be set to t = floor((n-1)/2). const ( -// ThresholdSignMinSize is the minimum size of a group participating in a threshold signature protocol -// ThresholdSignMinSize = MinimumThreshold + 1 -// ThresholdSignMaxSize is the maximum size of a group participating in a threshold signature protocol -// ThresholdSignMaxSize = DKGMaxSize + // ThresholdSignMinSize is the minimum size of a group participating in a threshold signature protocol + ThresholdSignMinSize = MinimumThreshold + 1 + // ThresholdSignMaxSize is the maximum size of a group participating in a threshold signature protocol + ThresholdSignMaxSize = DKGMaxSize ) // ThresholdSignatureInspector is an inspector of the threshold signature protocol. 
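For reference, the share computation in Fr_polynomialImage above is a Horner evaluation; the following is a sketch of that identity, assuming (as defined earlier in the patch) that R is the Montgomery constant, x is the small participant index, xR = x*R, and Fr_mul_montg(a,b) returns a*b*R^(-1):

  P(x) = a_0 + a_1*x + .. + a_n*x^n
       = ((..(a_n*x + a_(n-1))*x + ..)*x + a_1)*x + a_0   (mod r)

  acc <- Fr_mul_montg(acc, xR) = acc*(x*R)*R^(-1) = acc*x
  acc <- acc + a_i

Because each Montgomery multiplication by xR cancels the extra R factor, only x needs to be converted to Montgomery form; the coefficients a_i and the accumulator stay in normal form throughout, which is why the loop needs no final conversion of the result.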
From 8b49d0ac6979a0321ce4db21db15050f44819301 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 27 Feb 2023 13:37:21 -0600 Subject: [PATCH 013/200] add Fermat inversion - Lagrange interpolation works with Euclidean inversion and multiple cleanups --- crypto/bls12381_hashtocurve.c | 12 +--- crypto/bls12381_utils.c | 102 +++++++++++++++++++++++++++---- crypto/bls12381_utils.h | 21 ++++--- crypto/bls_core.c | 2 - crypto/bls_thresholdsign_core.c | 54 ++++------------ crypto/bls_thresholdsign_test.go | 35 +++++++++-- crypto/blst_include.h | 6 -- crypto/blst_tools.c | 24 -------- 8 files changed, 147 insertions(+), 109 deletions(-) delete mode 100644 crypto/blst_tools.c diff --git a/crypto/bls12381_hashtocurve.c b/crypto/bls12381_hashtocurve.c index 62053d7ed22..3e8217d42e5 100644 --- a/crypto/bls12381_hashtocurve.c +++ b/crypto/bls12381_hashtocurve.c @@ -10,7 +10,7 @@ extern prec_st* bls_prec; // These constants are taken from https://github.com/kwantam/bls12-381_hash // and converted to the Mongtomery domain. // Copyright 2019 Riad S. Wahby -const uint64_t iso_Nx_data[ELLP_Nx_LEN][Fp_DIGITS] = { +const uint64_t iso_Nx_data[ELLP_Nx_LEN][Fp_LIMBS] = { {0x4d18b6f3af00131c, 0x19fa219793fee28c, 0x3f2885f1467f19ae, 0x23dcea34f2ffb304, 0xd15b58d2ffc00054, 0x0913be200a20bef4,}, {0x898985385cdbbd8b, 0x3c79e43cc7d966aa, 0x1597e193f4cd233a, @@ -37,7 +37,7 @@ const uint64_t iso_Nx_data[ELLP_Nx_LEN][Fp_DIGITS] = { 0x464170142a1009eb, 0xb14f01aadb30be2f, 0x18ae6a856f40715d,}, }; -const uint64_t iso_Ny_data[ELLP_Ny_LEN][Fp_DIGITS] = { +const uint64_t iso_Ny_data[ELLP_Ny_LEN][Fp_LIMBS] = { {0x2b567ff3e2837267, 0x1d4d9e57b958a767, 0xce028fea04bd7373, 0xcc31a30a0b6cd3df, 0x7d7b18a682692693, 0x0d300744d42a0310,}, {0x99c2555fa542493f, 0xfe7f53cc4874f878, 0x5df0608b8f97608a, @@ -335,12 +335,4 @@ void map_to_G1(ep_t h, const byte* data, const int len) { #elif hashToPoint==RELIC_SSWU ep_map_from_field(h, data, len); #endif - - /*Fr a, b; - Fr_set_limb(&a, 1); - Fr_print_("a", &a); - Fr_inv_montg_eucl(&b,&a); - Fr_print_("b", &b); - Fr_from_montg(&b, &b); - Fr_print_("b", &b); */ } diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index c8dfb808827..dc8b642de66 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -24,6 +24,16 @@ int get_Fr_BYTES() { } // Fr utilities +// Montgomery constant R related to the curve order r +const Fr BLS12_381_rR = (Fr){ /* (1<<256)%r */ + TO_LIMB_T(0x1824b159acc5056f), TO_LIMB_T(0x998c4fefecbc4ff5), + TO_LIMB_T(0x5884b7fa00034802), TO_LIMB_T(0x00000001fffffffe) +}; + +/*0x1824b159acc5056f +0x998c4fefecbc4ff5 +0x5884b7fa00034802 +0x00000001fffffffe*/ // TODO: temp utility function to delete bn_st* Fr_blst_to_relic(const Fr* x) { @@ -51,7 +61,7 @@ void Fr_set_limb(Fr* a, const limb_t l){ *((limb_t*)a) = l; } -void Fr_copy(Fr* res, Fr* a) { +void Fr_copy(Fr* res, const Fr* a) { vec_copy((byte*)res, (byte*)a, Fr_BYTES); } @@ -72,19 +82,26 @@ void Fr_neg(Fr *res, const Fr *a) { cneg_mod_256((limb_t*)res, (limb_t*)a, 1, BLS12_381_r); } +// res = a*b*R^(-1) void Fr_mul_montg(Fr *res, const Fr *a, const Fr *b) { mul_mont_sparse_256((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_r, r0); } +// res = a^2 * R^(-1) +void Fr_squ_montg(Fr *res, const Fr *a) { + sqr_mont_sparse_256((limb_t*)res, (limb_t*)a, BLS12_381_r, r0); +} + +// res = a*R void Fr_to_montg(Fr *res, const Fr *a) { mul_mont_sparse_256((limb_t*)res, (limb_t*)a, BLS12_381_rRR, BLS12_381_r, r0); } +// res = a*R^(-1) void Fr_from_montg(Fr *res, const Fr *a) { from_mont_256((limb_t*)res, 
(limb_t*)a, BLS12_381_r, r0); } -// result is in Montgomery form // res = a^(-1)*R void Fr_inv_montg_eucl(Fr *res, const Fr *a) { // copied and modified from BLST code @@ -98,8 +115,49 @@ void Fr_inv_montg_eucl(Fr *res, const Fr *a) { redc_mont_256((limb_t*)res, temp, BLS12_381_r, r0); } -void Fr_inv_montg_expo(Fr *res, const Fr *a) { - // TODO: +// result is in Montgomery form if base is in montgomery form +// if base = b*R, res = b^expo * R +// In general, res = base^expo * R^(-expo+1) +// `expo` is encoded as a little-endian limb_t table of length `expo_len`. +void Fr_exp_montg(Fr *res, const Fr* base, const limb_t* expo, const int expo_len) { + // mask of the most significant bit + const limb_t msb_mask = (limb_t)1<<((sizeof(limb_t)<<3)-1); + limb_t mask = msb_mask; + int index = 0; + + expo += expo_len; + // Treat most significant zero limbs + while((index < expo_len) && (*(--expo) == 0)) { + index++; + } + // Treat the most significant zero bits + while((*expo & mask) == 0) { + mask >>= 1; + } + // Treat the first `1` bit + Fr_copy(res, base); + mask >>= 1; + // Scan all limbs of the exponent + for ( ; index < expo_len; expo--) { + // Scan all bits + for ( ; mask != 0 ; mask >>= 1 ) { + // square + Fr_squ_montg(res, res); + // multiply + if (*expo & mask) { + Fr_mul_montg(res, res ,base); + } + } + mask = msb_mask; + index++; + } +} + +void Fr_inv_exp_montg(Fr *res, const Fr *a) { + Fr r_2; + Fr_copy(&r_2, (Fr*)BLS12_381_r); + r_2.limbs[0] -= 2; + Fr_exp_montg(res, a, (limb_t*)&r_2, 4); } // computes the sum of the array elements and writes the sum in jointx @@ -110,6 +168,24 @@ void Fr_sum_vector(Fr* jointx, const Fr x[], const int len) { } } +// internal type of BLST `pow256` uses bytes little endian. +// input is bytes big endian as used by Flow crypto lib external scalars. 
+static void pow256_from_be_bytes(pow256 ret, const unsigned char a[Fr_BYTES]) +{ + unsigned char* b = (unsigned char*)a + Fr_BYTES - 1; + if ((uptr_t)ret == (uptr_t)a) { // swap in place + for (int i=0; ibeta); - bn_read_raw(&bls_prec->beta, beta_data, Fp_DIGITS); + bn_read_raw(&bls_prec->beta, beta_data, Fp_LIMBS); bn_new(&bls_prec->z2_1_by3); bn_read_raw(&bls_prec->z2_1_by3, z2_1_by3_data, 2); #endif @@ -329,8 +405,8 @@ void bytes_print_(char* s, byte* data, int len) { void Fr_print_(char* s, Fr* a) { printf("[%s]:\n", s); - limb_t* p = (limb_t*)(a) + Fr_DIGITS; - for (int i=0; i>3) -#define BITS_TO_DIGITS(x) ((x+63)>>6) -#define BYTES_TO_DIGITS(x) ((x+7)>>3) -#define DIGITS_TO_BYTES(x) ((x)<<3) +#define BITS_TO_LIMBS(x) ((x+63)>>6) +#define BYTES_TO_LIMBS(x) ((x+7)>>3) +#define LIMBS_TO_BYTES(x) ((x)<<3) #define MIN(a,b) ((a)>(b)?(b):(a)) // Fields and Group serialization lengths #define SEC_BITS 128 #define Fp_BITS 381 #define Fp2_BYTES (2*Fp_BYTES) -#define Fp_DIGITS BITS_TO_DIGITS(Fp_BITS) -#define Fp_BYTES DIGITS_TO_BYTES(Fp_DIGITS) // BLST implements Fp as a limb array +#define Fp_LIMBS BITS_TO_LIMBS(Fp_BITS) +#define Fp_BYTES LIMBS_TO_BYTES(Fp_LIMBS) // BLST implements Fp as a limb array #define Fr_BITS 255 -#define Fr_DIGITS BITS_TO_DIGITS(Fr_BITS) -#define Fr_BYTES DIGITS_TO_BYTES(Fr_DIGITS) // BLST implements Fr as a limb array +#define Fr_LIMBS BITS_TO_LIMBS(Fr_BITS) +#define Fr_BYTES LIMBS_TO_BYTES(Fr_LIMBS) // BLST implements Fr as a limb array #define G1_BYTES (2*Fp_BYTES) #define G2_BYTES (2*Fp2_BYTES) @@ -92,20 +92,23 @@ int bls_spock_verify(const ep2_t, const byte*, const ep2_t, const byte*); void map_to_G1(ep_t, const byte*, const int); // Fr utilities +extern const Fr BLS12_381_rR; bool_t Fr_is_zero(const Fr* a); bool_t Fr_is_equal(const Fr* a, const Fr* b); void Fr_set_limb(Fr*, const limb_t); -void Fr_copy(Fr*, Fr*); +void Fr_copy(Fr*, const Fr*); void Fr_set_zero(Fr*); void Fr_add(Fr *res, const Fr *a, const Fr *b); void Fr_sub(Fr *res, const Fr *a, const Fr *b); void Fr_neg(Fr *res, const Fr *a); void Fr_sum_vector(Fr*, const Fr x[], const int); void Fr_mul_montg(Fr *res, const Fr *a, const Fr *b); +void Fr_squ_montg(Fr *res, const Fr *a); void Fr_to_montg(Fr *res, const Fr *a); void Fr_from_montg(Fr *res, const Fr *a); +void Fr_exp_montg(Fr *res, const Fr* base, const limb_t* expo, const int expo_len); void Fr_inv_montg_eucl(Fr *res, const Fr *a); -void Fr_inv_montg_expo(Fr *res, const Fr *a); +void Fr_inv_exp_montg(Fr *res, const Fr *a); BLST_ERROR Fr_read_bytes(Fr* a, const uint8_t *bin, int len); BLST_ERROR Fr_star_read_bytes(Fr* a, const uint8_t *bin, int len); void Fr_write_bytes(uint8_t *bin, const Fr* a); diff --git a/crypto/bls_core.c b/crypto/bls_core.c index ecec6ae346c..a1d47c73f17 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -108,14 +108,12 @@ static int bls_verify_ep(const ep2_t pk, const ep_t s, const byte* data, const i fp12_t pair; fp12_new(&pair); - if (core_get()->code != RLC_OK) printf("EUUUUUUUU\n"); // double pairing with Optimal Ate pp_map_sim_oatep_k12(pair, (ep_t*)(elemsG1) , (ep2_t*)(elemsG2), 2); // compare the result to 1 int res = fp12_cmp_dig(pair, 1); - #elif SINGLE_PAIRING fp12_t pair1, pair2; fp12_new(&pair1); fp12_new(&pair2); diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index 68f5005ace4..e6f94716d9b 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -16,23 +16,16 @@ static void Fr_lagrangeCoefficientAtZero(Fr* res, const int i, const uint8_t ind Fr 
denominator; // eventually would represent D*R^k // Initialize N and D to Montgomery constant R - // TODO: hardcode R and add Fr_copy function - Fr_copy(&numerator, (Fr*)BLS12_381_rRR); - Fr_copy(&denominator, (Fr*)BLS12_381_rRR); - Fr_from_montg(&numerator, &numerator); - Fr_from_montg(&denominator, &denominator); + // TODO: hardcode R + Fr_copy(&numerator, &BLS12_381_rR); + Fr_copy(&denominator, &BLS12_381_rR); - // sign of D: 1 for positive and 0 for negative - int sign = 1; + // sign of D: 0 for positive and 1 for negative + int sign = 0; // the highest k such that fact(MAX_IND)/fact(MAX_IND-k) < 2^64 (approximately 64/MAX_IND_BITS) // this means we can multiply up to (k) indices in a limb (64 bits) without overflowing. #define MAX_IND_LOOPS 64/MAX_IND_BITS - - // choose inversion algorithm used for denominator - #define FERMAT_INVERSION 0 - #define EUCLIDEAN_INVERSION (FERMAT_INVERSION^1) - const int loops = MAX_IND_LOOPS; int k,j = 0; Fr tmp; @@ -50,47 +43,26 @@ static void Fr_lagrangeCoefficientAtZero(Fr* res, const int i, const uint8_t ind } limb_numerator *= indices[j]; } + // numerator and denominator are both computed in Montgomery form. // update numerator Fr_set_limb(&tmp, limb_numerator); // L_N - #if EUCLIDEAN_INVERSION == 1 - // numerator and denominator are both computed in Montgomery form. - Fr_to_montg(&tmp, &tmp); // L_N*R - #endif + Fr_to_montg(&tmp, &tmp); // L_N*R Fr_mul_montg(&numerator, &numerator, &tmp); // N*R // update denominator Fr_set_limb(&tmp, limb_denominator); // L_D - #if EUCLIDEAN_INVERSION == 1 - // keep numertaor and denominator are both computed in Montgomery form. - Fr_to_montg(&tmp, &tmp); // L_D*R - #endif + Fr_to_montg(&tmp, &tmp); // L_D*R Fr_mul_montg(&denominator, &denominator, &tmp); // D*R - //printf("%d--%lld--%lld\n", sign, limb_numerator, limb_denominator); } - if (!sign) { + if (sign) { Fr_neg(&denominator, &denominator); } - #if EUCLIDEAN_INVERSION == 1 - // at this point, denominator = D*R , numertaor = N*R - // inversion - Fr_inv_montg_eucl(&denominator, &denominator); // (DR)^(-1)*R = D^(-1) - Fr_mul_montg(res, &numerator, &denominator); // N*D^(-1) - #endif - - //printf("%d:LI(%d):\n", i, indices[i]); - //Fr_print_("res", res); - - #if FERMAT_INVERSION == 1 - // at this point, denominator = D*R^c , numertaor = N*R^c - // (c is the nummber of mult_mont, but the exact value isn't relevant) - // inversion inv(xR) = x^(-1)R - Fr_inv_montg_expo(&denominator, &denominator); // inv(D*R^c) = inv(D*R^(c-1)*R) = D^(-1)*R^(1-c)*R - Fr_mul_montg(&numerator, &numerator, &denominator); //N*D^(-1)*R - Fr_from_montg(res, &numerator); //N*D^(-1) - #endif + // at this point, denominator = D*R , numertaor = N*R + // inversion inv(x) = x^(-1)R + Fr_inv_montg_eucl(&denominator, &denominator); // (DR)^(-1)*R = D^(-1) + Fr_mul_montg(res, &numerator, &denominator); // N*D^(-1) } - // Computes the Langrange interpolation at zero P(0) = LI(0) with regards to the indices [indices(0)..indices(t)] // and their G1 images [shares(0)..shares(t)], and stores the resulting G1 point in `dest`. // `len` is equal to `t+1` where `t` is the polynomial degree. 
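For reference, a sketch of the standard interpolation identity implemented above, assuming indices(0)..indices(t) are the signer indices passed from the Go layer (shifted by +1 so that no index is zero) and shares(i) are their G1 signature shares:

  L_i(0) = ( prod_{j!=i} indices(j) ) / ( prod_{j!=i} (indices(j) - indices(i)) )   (mod r)
  P(0)   = sum_{i=0..t} L_i(0) * shares(i)     (sum computed in G1)

The numerator and denominator products correspond to the N and D values accumulated limb by limb in Fr_lagrangeCoefficientAtZero (with the sign of D tracked separately), and D is inverted once per coefficient with Fr_inv_montg_eucl before the final N*D^(-1) multiplication.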
diff --git a/crypto/bls_thresholdsign_test.go b/crypto/bls_thresholdsign_test.go index 3d05177369c..04603a70a55 100644 --- a/crypto/bls_thresholdsign_test.go +++ b/crypto/bls_thresholdsign_test.go @@ -546,7 +546,7 @@ func testCentralizedStatelessAPI(t *testing.T) { n := 10 for threshold := MinimumThreshold; threshold < n; threshold++ { // generate threshold keys - r := int64(1677308758239641000) //time.Now().UnixNano() + r := time.Now().UnixNano() mrand.Seed(r) t.Log(r) seed := make([]byte, SeedMinLenDKG) @@ -607,7 +607,7 @@ func testCentralizedStatelessAPI(t *testing.T) { } } -/*func BenchmarkSimpleKeyGen(b *testing.B) { +func BenchmarkSimpleKeyGen(b *testing.B) { n := 60 seed := make([]byte, SeedMinLenDKG) _, _ = rand.Read(seed) @@ -616,6 +616,33 @@ func testCentralizedStatelessAPI(t *testing.T) { _, _, _, _ = BLSThresholdKeyGen(n, optimalThreshold(n), seed) } b.StopTimer() -}*/ +} -// TODO: add benchmark for signature reconstruction +func BenchmarkSignatureReconstruction(b *testing.B) { + n := 60 + seed := make([]byte, SeedMinLenDKG) + _, _ = rand.Read(seed) + threshold := 40 + // generate threshold keys + skShares, _, _, err := BLSThresholdKeyGen(n, threshold, seed) + require.NoError(b, err) + // signature hasher + kmac := NewExpandMsgXOFKMAC128(thresholdSignatureTag) + // generate signature shares + signShares := make([]Signature, 0, threshold+1) + signers := make([]int, 0, threshold+1) + // create (t+1) signatures of the first randomly chosen signers + for i := 0; i < threshold+1; i++ { + signers = append(signers, i) + share, err := skShares[i].Sign(thresholdSignatureMessage, kmac) + require.NoError(b, err) + signShares = append(signShares, share) + } + // reconstruct + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := BLSReconstructThresholdSignature(n, threshold, signShares, signers) + require.NoError(b, err) + } + b.StopTimer() +} diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 0fd710d1579..d7a7cdf4367 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -24,10 +24,4 @@ typedef POINTonE1 G1; // Subroup G1 in E2 typedef POINTonE2 G2; - -// extra functions and tools that are needed by the Flow crypto library -// and that are not exported in the desired form by BLST - -void pow256_from_be_bytes(pow256 ret, const unsigned char a[32]); - #endif \ No newline at end of file diff --git a/crypto/blst_tools.c b/crypto/blst_tools.c deleted file mode 100644 index 81fba31ac9e..00000000000 --- a/crypto/blst_tools.c +++ /dev/null @@ -1,24 +0,0 @@ -// +build relic - -// extra tools to use BLST low level that are needed by the Flow crypto library - -#include "blst_include.h" -#include "bls12381_utils.h" - -// internal type of BLST `pow256` uses bytes little endian. -// input is bytes big endian as used by Flow crypto lib external scalars. 
-void pow256_from_be_bytes(pow256 ret, const unsigned char a[Fr_BYTES]) -{ - unsigned char* b = (unsigned char*)a + Fr_BYTES - 1; - if ((uptr_t)ret == (uptr_t)a) { // swap in place - for (int i=0; i Date: Mon, 27 Feb 2023 14:37:28 -0600 Subject: [PATCH 014/200] DKG feldmanVSS with Fr type --- crypto/bls.go | 18 ++++-------------- crypto/bls12381_utils.go | 24 +++++++++++++++++++++++- crypto/dkg_core.c | 28 +++++++++++----------------- crypto/dkg_feldmanvss.go | 18 +++++++----------- crypto/dkg_feldmanvssq.go | 24 +++++++++--------------- crypto/dkg_include.h | 4 ++-- crypto/dkg_test.go | 35 +++++++++++++++++------------------ 7 files changed, 73 insertions(+), 78 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 48996e0ae9d..447ba6f532e 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -325,22 +325,12 @@ func BLSInvalidSignature() Signature { func (a *blsBLS12381Algo) decodePrivateKey(privateKeyBytes []byte) (PrivateKey, error) { sk := newPrKeyBLSBLS12381(nil) - read := C.Fr_star_read_bytes( - (*C.Fr)(&sk.scalar), - (*C.uchar)(&privateKeyBytes[0]), - (C.int)(prKeyLengthBLSBLS12381)) + err := readScalarFrStar(&sk.scalar, privateKeyBytes) - switch int(read) { - case blst_valid: - return sk, nil - case blst_bad_encoding: - return nil, invalidInputsErrorf("input length must be %d, got %d", - prKeyLengthBLSBLS12381, len(privateKeyBytes)) - case blst_bad_scalar: - return nil, invalidInputsErrorf("the private key is not in the correct range for the BLS12-381 curve") - default: - return nil, invalidInputsErrorf("reading the private key failed") + if err != nil { + return nil, fmt.Errorf("failed to read the private key: %w", err) } + return sk, nil } // decodePublicKey decodes a slice of bytes into a public key. diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index d569bf0cc38..cbdc718e364 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -159,7 +159,7 @@ func mapToZr(x *scalar, src []byte) bool { return false } -// writeScalar writes a G2 point in a slice of bytes +// writeScalar writes a scalar in a slice of bytes func writeScalar(dest []byte, x *scalar) { C.Fr_write_bytes((*C.uchar)(&dest[0]), (*C.Fr)(x)) } @@ -184,6 +184,28 @@ func writePointG1(dest []byte, a *pointG1) { ) } +// read an Fr* element from a byte slice +// and stores it into a `scalar` type element. +func readScalarFrStar(a *scalar, src []byte) error { + read := C.Fr_star_read_bytes( + (*C.Fr)(a), + (*C.uchar)(&src[0]), + (C.int)(len(src))) + + switch int(read) { + case blst_valid: + return nil + case blst_bad_encoding: + return invalidInputsErrorf("input length must be %d, got %d", + frBytesLen, len(src)) + case blst_bad_scalar: + return invalidInputsErrorf("scalar is not in the correct range w.r.t the BLS12-381 curve") + default: + return invalidInputsErrorf("reading the scalar failed") + } + +} + // readPointG2 reads a G2 point from a slice of bytes // The slice is expected to be of size PubKeyLenBLSBLS12381 and the deserialization will // follow the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index fa4729c84e2..34d6addbffb 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -6,17 +6,17 @@ #define N_max 250 #define N_bits_max 8 // log(250) #define T_max ((N_max-1)/2) -/* + // computes P(x) = a_0 + a_1*x + .. 
+ a_n x^n (mod r) // r being the order of G1 // writes P(x) in out and P(x).g2 in y if y is non NULL // x being a small integer -void Fr_polynomialImage_export(byte* out, ep2_t y, const bn_st* a, const int a_size, const byte x){ +void Fr_polynomialImage_export(byte* out, ep2_t y, const Fr* a, const int a_size, const byte x){ Fr image; Fr_polynomialImage(&image, y, a, a_size, x); // exports the result Fr_write_bytes(out, &image); -}*/ +} // computes P(x) = a_0 + a_1 * x + .. + a_n * x^n where P is in Fr[X]. // a_i are all in Fr, `a_size` - 1 is P's degree, x is a small integer less than 255. @@ -42,8 +42,7 @@ void Fr_polynomialImage(Fr* image, ep2_t y, const Fr* a, const int a_size, const // computes Q(x) = A_0 + A_1*x + ... + A_n*x^n in G2 // and stores the point in y // r is the order of G2 -static void G2_polynomialImage(ep2_t y, const ep2_st* A, const int len_A, - const byte x, const bn_t r){ +static void G2_polynomialImage(ep2_t y, const ep2_st* A, const int len_A, const byte x){ bn_t bn_x; bn_new(bn_x); @@ -54,24 +53,18 @@ static void G2_polynomialImage(ep2_t y, const ep2_st* A, const int len_A, ep2_add_projc(y, y, (ep2_st*)&A[i]); } - ep2_norm(y, y); // not necessary but left here to optimize the + ep2_norm(y, y); // not necessary but called to optimize the // multiple pairing computations with the same public key bn_free(bn_x); } -// compute the participants public keys from the verification vector -// y[i] = Q(i+1) for all participants i, with: -// Q(x) = A_0 + A_1*x + ... + A_n*x^n in G2 +// computes y[i] = Q(i+1) for all participants i ( 0 <= i < len_y) +// where Q(x) = A_0 + A_1*x + ... + A_n*x^n in G2[X] void G2_polynomialImages(ep2_st *y, const int len_y, const ep2_st* A, const int len_A) { - // order r - bn_t r; - bn_new(r); - g2_get_ord(r); for (byte i=0; i threshold || (n-r1) <= threshold)) || - (dkg == feldmanVSSQual && r1 == 1) { // case of a single dealer + if false { //(dkg == jointFeldman && (r1 > threshold || (n-r1) <= threshold)) || + //(dkg == feldmanVSSQual && r1 == 1) { // case of a single dealer t.Logf("dkg failed, there are %d disqualified participants\n", r1) // DKG failed, check for final errors for i := r1; i < n; i++ { @@ -442,7 +441,7 @@ func timeoutPostProcess(processors []testDKGProcessor, t *testing.T, phase int) }(i) } } -}*/ +} // implements DKGProcessor interface type testDKGProcessor struct { From 2bbea260fb2d16a225da3549263353755a401e46 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 27 Feb 2023 19:36:52 -0600 Subject: [PATCH 015/200] enable all DKG protocols to work with new Fr type --- crypto/bls_thresholdsign_test.go | 8 ++++---- crypto/dkg_feldmanvssq.go | 10 ++++------ crypto/dkg_jointfeldman.go | 5 ++--- crypto/dkg_test.go | 22 +++++++++++----------- 4 files changed, 21 insertions(+), 24 deletions(-) diff --git a/crypto/bls_thresholdsign_test.go b/crypto/bls_thresholdsign_test.go index 04603a70a55..5473b454827 100644 --- a/crypto/bls_thresholdsign_test.go +++ b/crypto/bls_thresholdsign_test.go @@ -21,8 +21,8 @@ func TestBLSThresholdSignature(t *testing.T) { t.Run("centralized_stateless_keygen", testCentralizedStatelessAPI) // stateful API t.Run("centralized_stateful_keygen", testCentralizedStatefulAPI) - //t.Run("distributed_stateful_feldmanVSS_keygen", testDistributedStatefulAPI_FeldmanVSS) - //t.Run("distributed_stateful_jointFeldman_keygen", testDistributedStatefulAPI_JointFeldman) // Flow Random beacon case + t.Run("distributed_stateful_feldmanVSS_keygen", testDistributedStatefulAPI_FeldmanVSS) + 
t.Run("distributed_stateful_jointFeldman_keygen", testDistributedStatefulAPI_JointFeldman) // Flow Random beacon case } const thresholdSignatureTag = "random tag" @@ -314,7 +314,7 @@ func testCentralizedStatefulAPI(t *testing.T) { // Distributed Threshold Signature stateful api test // keys are generated using simple Feldman VSS -/*func testDistributedStatefulAPI_FeldmanVSS(t *testing.T) { +func testDistributedStatefulAPI_FeldmanVSS(t *testing.T) { log.SetLevel(log.ErrorLevel) log.Info("DKG starts") gt = t @@ -439,7 +439,7 @@ func testDistributedStatefulAPI_JointFeldman(t *testing.T) { // synchronize the main thread to end TS sync.Wait() } -}*/ +} // This is a testing function // It simulates processing incoming messages by a participant during DKG diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index 53a1dc4278b..cc0b94962df 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -3,7 +3,6 @@ package crypto -/* // #cgo CFLAGS: // #include "dkg_include.h" import "C" @@ -203,14 +202,13 @@ func (s *feldmanVSSQualState) End() (PrivateKey, PublicKey, []PublicKey, error) return nil, nil, nil, dkgFailureErrorf("group private key is identity and is therefore invalid") } return x, Y, y, nil -}*/ +} const ( complaintSize = 1 complaintAnswerSize = 1 + PrKeyLenBLSBLS12381 ) -/* // HandleBroadcastMsg processes a new broadcasted message received by the current participant. // orig is the message origin index // @@ -624,7 +622,7 @@ func (s *feldmanVSSQualState) receiveComplaintAnswer(origin index, data []byte) } // read the complainer private share - err := readScalarFrStar(&s.complaints[complainer].answer, data[1]) + err := readScalarFrStar(&s.complaints[complainer].answer, data[1:]) if err != nil { s.disqualified = true s.processor.Disqualify(int(s.dealerIndex), @@ -645,7 +643,7 @@ func (s *feldmanVSSQualState) receiveComplaintAnswer(origin index, data []byte) // flag check is a sanity check if c.received { // read the complainer private share - err := readScalarFrStar(&c.answer, data[1]) + err := readScalarFrStar(&c.answer, data[1:]) if err != nil { s.disqualified = true s.processor.Disqualify(int(s.dealerIndex), @@ -666,4 +664,4 @@ func (s *feldmanVSSQualState) receiveComplaintAnswer(origin index, data []byte) s.x = c.answer } } -}*/ +} diff --git a/crypto/dkg_jointfeldman.go b/crypto/dkg_jointfeldman.go index be8b2c9f70f..bef857fba37 100644 --- a/crypto/dkg_jointfeldman.go +++ b/crypto/dkg_jointfeldman.go @@ -3,7 +3,6 @@ package crypto -/* // #cgo CFLAGS: // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #include "dkg_include.h" @@ -203,7 +202,7 @@ func (s *JointFeldmanState) End() (PrivateKey, PublicKey, []PublicKey, error) { jointx, jointPublicKey, jointy := s.sumUpQualifiedKeys(s.size - disqualifiedTotal) // private key of the current participant - x := newPrKeyBLSBLS12381(&jointx) + x := newPrKeyBLSBLS12381(jointx) // Group public key Y := newPubKeyBLSBLS12381(jointPublicKey) @@ -338,4 +337,4 @@ func (s *JointFeldmanState) getQualifiedKeys(qualified int) ([]scalar, []pointG2 } } return qualifiedx, qualifiedPubKey, qualifiedy -}*/ +} diff --git a/crypto/dkg_test.go b/crypto/dkg_test.go index d3347df0c93..0d32a3fd1ec 100644 --- a/crypto/dkg_test.go +++ b/crypto/dkg_test.go @@ -19,7 +19,7 @@ var gt *testing.T func TestDKG(t *testing.T) { t.Run("FeldmanVSSSimple", testFeldmanVSSSimple) - //t.Run("FeldmanVSSQual", testFeldmanVSSQual) + t.Run("FeldmanVSSQual", testFeldmanVSSQual) //t.Run("JointFeldman", testJointFeldman) } @@ -67,7 +67,7 @@ const ( 
invalidSharesComplainTrigger invalidComplaintAnswerBroadcast duplicatedSendAndBroadcast -) /* +) // Testing Feldman VSS with the qualification system by simulating a network of n participants func testFeldmanVSSQual(t *testing.T) { @@ -96,6 +96,7 @@ func testFeldmanVSSQual(t *testing.T) { // are only tested within joint feldman. } +/* // Testing JointFeldman by simulating a network of n participants func testJointFeldman(t *testing.T) { log.SetLevel(log.ErrorLevel) @@ -137,8 +138,8 @@ func testJointFeldman(t *testing.T) { // Supported Key Generation protocols const ( feldmanVSS = iota - /*feldmanVSSQual - jointFeldman*/ + feldmanVSSQual + jointFeldman ) func newDKG(dkg int, size int, threshold int, myIndex int, @@ -146,10 +147,10 @@ func newDKG(dkg int, size int, threshold int, myIndex int, switch dkg { case feldmanVSS: return NewFeldmanVSS(size, threshold, myIndex, processor, dealerIndex) - /*case feldmanVSSQual: + case feldmanVSSQual: return NewFeldmanVSSQual(size, threshold, myIndex, processor, dealerIndex) case jointFeldman: - return NewJointFeldman(size, threshold, myIndex, processor)*/ + return NewJointFeldman(size, threshold, myIndex, processor) default: return nil, fmt.Errorf("non supported protocol") } @@ -171,12 +172,11 @@ func dkgCommonTest(t *testing.T, dkg int, n int, threshold int, test testCase) { // number of dealers in the protocol var dealers int - /*if dkg == jointFeldman { + if dkg == jointFeldman { dealers = n } else { dealers = 1 - }*/ - dealers = 1 + } // create n processors for all participants processors := make([]testDKGProcessor, 0, n) @@ -348,8 +348,8 @@ func dkgCommonTest(t *testing.T, dkg int, n int, threshold int, test testCase) { assert.Equal(t, expected, processors[i].disqualified) } // check if DKG is successful - if false { //(dkg == jointFeldman && (r1 > threshold || (n-r1) <= threshold)) || - //(dkg == feldmanVSSQual && r1 == 1) { // case of a single dealer + if (dkg == jointFeldman && (r1 > threshold || (n-r1) <= threshold)) || + (dkg == feldmanVSSQual && r1 == 1) { // case of a single dealer t.Logf("dkg failed, there are %d disqualified participants\n", r1) // DKG failed, check for final errors for i := r1; i < n; i++ { From b113d3362152a2eaf1e97dda025835c32967e0f6 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 3 Mar 2023 13:53:22 -0600 Subject: [PATCH 016/200] uncomment tests --- crypto/bls_multisig.go | 20 +++----------------- crypto/bls_test.go | 16 +++++++--------- crypto/blst_include.h | 2 +- crypto/dkg_feldmanvss.go | 8 ++++---- crypto/dkg_feldmanvssq.go | 2 +- crypto/dkg_test.go | 5 ++--- 6 files changed, 18 insertions(+), 35 deletions(-) diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index 297e61267d9..b4fa5918ef7 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -5,11 +5,13 @@ package crypto import ( "errors" + "fmt" _ "errors" _ "fmt" + "github.com/onflow/flow-go/crypto/hash" _ "github.com/onflow/flow-go/crypto/hash" ) @@ -41,7 +43,6 @@ import "C" // used for signatures. var popKMAC = internalExpandMsgXOFKMAC128(blsPOPCipherSuite) -/* // BLSGeneratePOP returns a proof of possession (PoP) for the receiver private key. 
// // The KMAC hasher used in the function is guaranteed to be orthogonal to all hashers used @@ -97,8 +98,6 @@ func BLSVerifyPOP(pk PublicKey, s Signature) (bool, error) { // - (aggregated_signature, nil) otherwise func AggregateBLSSignatures(sigs []Signature) (Signature, error) { - - // check for empty list if len(sigs) == 0 { return nil, blsAggregateEmptyListError @@ -144,8 +143,6 @@ func AggregateBLSSignatures(sigs []Signature) (Signature, error) { // - (aggregated_key, nil) otherwise func AggregateBLSPrivateKeys(keys []PrivateKey) (PrivateKey, error) { - - // check for empty list if len(keys) == 0 { return nil, blsAggregateEmptyListError @@ -181,8 +178,6 @@ func AggregateBLSPrivateKeys(keys []PrivateKey) (PrivateKey, error) { // - (aggregated_key, nil) otherwise func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { - - // check for empty list if len(keys) == 0 { return nil, blsAggregateEmptyListError @@ -203,7 +198,7 @@ func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { sumKey := newPubKeyBLSBLS12381(&sum) return sumKey, nil -}*/ +} // IdentityBLSPublicKey returns an identity public key which corresponds to the point // at infinity in G2 (identity element of G2). @@ -217,8 +212,6 @@ func IdentityBLSPublicKey() PublicKey { return &identity } -/* - // RemoveBLSPublicKeys removes multiple BLS public keys from a given (aggregated) public key. // // The common use case assumes the aggregated public key was initially formed using @@ -235,8 +228,6 @@ func IdentityBLSPublicKey() PublicKey { // - (remaining_key, nil) otherwise func RemoveBLSPublicKeys(aggKey PublicKey, keysToRemove []PublicKey) (PublicKey, error) { - - aggPKBLS, ok := aggKey.(*pubKeyBLSBLS12381) if !ok { return nil, notBLSKeyError @@ -335,8 +326,6 @@ func VerifyBLSSignatureManyMessages( pks []PublicKey, s Signature, messages [][]byte, kmac []hash.Hasher, ) (bool, error) { - - // check signature length if len(s) != signatureLengthBLSBLS12381 { return false, nil @@ -484,8 +473,6 @@ func BatchVerifyBLSSignaturesOneMessage( pks []PublicKey, sigs []Signature, message []byte, kmac hash.Hasher, ) ([]bool, error) { - - // empty list check if len(pks) == 0 { return []bool{}, fmt.Errorf("invalid list of public keys: %w", blsAggregateEmptyListError) @@ -549,7 +536,6 @@ func BatchVerifyBLSSignaturesOneMessage( return verifBool, nil } -*/ // blsAggregateEmptyListError is returned when a list of BLS objects (e.g. signatures or keys) // is empty or nil and thereby represents an invalid input. 
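The bls_multisig.go hunks above re-enable the aggregation API. The hedged usage sketch below shows how those calls compose in a test; it assumes PrivateKey exposes PublicKey() and Sign(message, hasher) with the signatures used elsewhere in the package's tests, assumes a crypto/rand import for the seed, and elides all error handling. It is an illustration, not part of the patch.

// n distinct keys sign the same message; the aggregate signature verifies
// against the list of public keys.
message := []byte("message to sign")
kmac := NewExpandMsgXOFKMAC128("test tag")
n := 5
sks := make([]PrivateKey, n)
pks := make([]PublicKey, n)
sigs := make([]Signature, n)
for i := 0; i < n; i++ {
	seed := make([]byte, KeyGenSeedMinLen)
	_, _ = rand.Read(seed)
	sks[i], _ = GeneratePrivateKey(BLSBLS12381, seed)
	pks[i] = sks[i].PublicKey() // assumed accessor, as used in the package tests
	sigs[i], _ = sks[i].Sign(message, kmac)
}
aggSig, _ := AggregateBLSSignatures(sigs) // point addition in G1
valid, _ := VerifyBLSSignatureOneMessage(pks, aggSig, message, kmac)
// valid is expected to be true; the same aggSig should also equal the signature
// of `message` under AggregateBLSPrivateKeys(sks), which is the property the
// same-message aggregation test later in this series checks.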
diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 579700a183e..8aec95a8b03 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -8,7 +8,9 @@ import ( "encoding/hex" "fmt" _ "math/rand" + mrand "math/rand" "testing" + "time" _ "time" "github.com/stretchr/testify/assert" @@ -130,7 +132,7 @@ func TestBLSBLS12381Hasher(t *testing.T) { assert.GreaterOrEqual(t, len(blsPOPCipherSuite), 16) }) - /*t.Run("orthogonal PoP and signature hashing", func(t *testing.T) { + t.Run("orthogonal PoP and signature hashing", func(t *testing.T) { data := []byte("random_data") // empty tag hasher sigKmac := NewExpandMsgXOFKMAC128("") @@ -139,7 +141,7 @@ func TestBLSBLS12381Hasher(t *testing.T) { // PoP hasher h2 := popKMAC.ComputeHash(data) assert.NotEqual(t, h1, h2) - })*/ + }) } @@ -215,7 +217,7 @@ func TestBLSUtils(t *testing.T) { } // BLS Proof of Possession test -/*func TestBLSPOP(t *testing.T) { +func TestBLSPOP(t *testing.T) { r := time.Now().UnixNano() mrand.Seed(r) t.Logf("math rand seed is %d", r) @@ -267,8 +269,6 @@ func TestBLSUtils(t *testing.T) { }) } - - // BLS multi-signature // signature aggregation sanity check // @@ -935,7 +935,7 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { assert.False(t, valid, "verification should fail with nil hasher") inputPks[0] = tmpPK }) -}*/ +} // TestBLSErrorTypes verifies working of error-type-detecting functions // such as `IsInvalidInputsError`. @@ -963,7 +963,6 @@ func TestBLSErrorTypes(t *testing.T) { }) } -/* // VerifyBLSSignatureManyMessages bench // Bench the slowest case where all messages and public keys are distinct. // (2*n) pairings without aggrgetion Vs (n+1) pairings with aggregation. @@ -1059,7 +1058,6 @@ func BenchmarkAggregate(b *testing.B) { }) } - func TestBLSIdentity(t *testing.T) { r := time.Now().UnixNano() mrand.Seed(r) @@ -1112,4 +1110,4 @@ func TestBLSIdentity(t *testing.T) { assert.NoError(t, err) assert.False(t, valid) }) -}*/ +} diff --git a/crypto/blst_include.h b/crypto/blst_include.h index d7a7cdf4367..77c06a9e5e5 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -24,4 +24,4 @@ typedef POINTonE1 G1; // Subroup G1 in E2 typedef POINTonE2 G2; -#endif \ No newline at end of file +#endif diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index e80dc8d71e6..d27f68ee45c 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -285,7 +285,7 @@ func (s *feldmanVSSstate) generateShares(seed []byte) error { // the dealer's own share if i-1 == s.myIndex { xdata := make([]byte, shareSize) - zrPolynomialImage(xdata, s.a, i, &s.y[i-1]) + frPolynomialImage(xdata, s.a, i, &s.y[i-1]) err := readScalarFrStar(&s.x, xdata) if err != nil { return fmt.Errorf("unexpected error when generating the dealer's own share: %w", err) @@ -295,7 +295,7 @@ func (s *feldmanVSSstate) generateShares(seed []byte) error { // the-other-participant shares data := make([]byte, shareSize+1) data[0] = byte(feldmanVSSShare) - zrPolynomialImage(data[1:], s.a, i, &s.y[i-1]) + frPolynomialImage(data[1:], s.a, i, &s.y[i-1]) s.processor.PrivateSend(int(i-1), data) } // broadcast the vector @@ -401,11 +401,11 @@ func (s *feldmanVSSstate) receiveVerifVector(origin index, data []byte) { } } -// zrPolynomialImage computes P(x) = a_0 + a_1*x + .. + a_n*x^n (mod r) in Z/Fr +// frPolynomialImage computes P(x) = a_0 + a_1*x + .. 
+ a_n*x^n (mod r) in Fr[X] // r being the order of G1 // P(x) is written in dest, while g2^P(x) is written in y // x being a small integer -func zrPolynomialImage(dest []byte, a []scalar, x index, y *pointG2) { +func frPolynomialImage(dest []byte, a []scalar, x index, y *pointG2) { C.Fr_polynomialImage_export((*C.uchar)(&dest[0]), (*C.ep2_st)(y), (*C.Fr)(&a[0]), (C.int)(len(a)), diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index cc0b94962df..ff9dad35879 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -495,7 +495,7 @@ func (s *feldmanVSSQualState) buildAndBroadcastComplaintAnswer(complainee index) data := make([]byte, complaintAnswerSize+1) data[0] = byte(feldmanVSSComplaintAnswer) data[1] = byte(complainee) - zrPolynomialImage(data[2:], s.a, complainee+1, nil) + frPolynomialImage(data[2:], s.a, complainee+1, nil) s.complaints[complainee].answerReceived = true s.processor.Broadcast(data) } diff --git a/crypto/dkg_test.go b/crypto/dkg_test.go index 0d32a3fd1ec..da0e05782a0 100644 --- a/crypto/dkg_test.go +++ b/crypto/dkg_test.go @@ -20,7 +20,7 @@ var gt *testing.T func TestDKG(t *testing.T) { t.Run("FeldmanVSSSimple", testFeldmanVSSSimple) t.Run("FeldmanVSSQual", testFeldmanVSSQual) - //t.Run("JointFeldman", testJointFeldman) + t.Run("JointFeldman", testJointFeldman) } // optimal threshold (t) to allow the largest number of malicious participants (m) @@ -96,7 +96,6 @@ func testFeldmanVSSQual(t *testing.T) { // are only tested within joint feldman. } -/* // Testing JointFeldman by simulating a network of n participants func testJointFeldman(t *testing.T) { log.SetLevel(log.ErrorLevel) @@ -134,7 +133,7 @@ func testJointFeldman(t *testing.T) { dkgCommonTest(t, jointFeldman, n, threshold, duplicatedMessages) }) } -*/ + // Supported Key Generation protocols const ( feldmanVSS = iota From 7a2617c56cf34d46e4875976952d771960d5e74e Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 3 Mar 2023 14:44:20 -0600 Subject: [PATCH 017/200] renaming and linter errors --- crypto/bls12381_utils.go | 9 +++------ crypto/bls12381_utils_test.go | 2 +- crypto/bls_thresholdsign.go | 12 +++++++++--- crypto/dkg_feldmanvss.go | 12 +++++++++--- 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index cbdc718e364..e2ee855e081 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -122,7 +122,7 @@ func (p *pointG2) isInfinity() bool { } // returns a random element of Fr in input pointer -func randZr(x *scalar) error { +func randFr(x *scalar) error { bytes := make([]byte, frBytesLen+securityBits/8) _, err := rand.Read(bytes) // checking one output is enough if err != nil { @@ -133,7 +133,7 @@ func randZr(x *scalar) error { } // writes a random element of Fr* in input pointer -func randZrStar(x *scalar) error { +func randFrStar(x *scalar) error { bytes := make([]byte, frBytesLen+securityBits/8) isZero := true for isZero { @@ -153,10 +153,7 @@ func mapToZr(x *scalar, src []byte) bool { isZero := C.map_bytes_to_Fr((*C.Fr)(x), (*C.uchar)(&src[0]), (C.int)(len(src))) - if isZero { - return true - } - return false + return bool(isZero) } // writeScalar writes a scalar in a slice of bytes diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 4662aa9567f..e7dba41a8eb 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -19,7 +19,7 @@ func BenchmarkScalarMultG1G2(b *testing.B) { _, _ = rand.Read(seed) _ = seedRelic(seed) var expo scalar - 
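The frPolynomialImage comment and the randFr/randFrStar wrappers above describe sampling a random polynomial over Fr and evaluating it at a small participant index. The self-contained math/big sketch below redoes the same arithmetic; the hex constant is the BLS12-381 group order r, nonZeroScalar stands in for randFrStar (the real code reduces a longer random byte string modulo r rather than calling rand.Int), and none of these names belong to the package.

package main

import (
	"crypto/rand"
	"fmt"
	"math/big"
)

// r is the BLS12-381 group order (the modulus called r throughout the diff).
var r, _ = new(big.Int).SetString(
	"73eda753299d7d483339d80809a1d80553bde402fffe5bfeffffffff00000001", 16)

// nonZeroScalar mirrors the intent of randFrStar: a uniform non-zero Fr element.
func nonZeroScalar() *big.Int {
	for {
		x, _ := rand.Int(rand.Reader, r)
		if x.Sign() != 0 {
			return x
		}
	}
}

// polynomialImage evaluates P(x) = a[0] + a[1]*x + ... + a[t]*x^t (mod r)
// with Horner's rule; x is the (small) participant index.
func polynomialImage(a []*big.Int, x int64) *big.Int {
	res := new(big.Int)
	for i := len(a) - 1; i >= 0; i-- {
		res.Mul(res, big.NewInt(x))
		res.Add(res, a[i])
		res.Mod(res, r)
	}
	return res
}

func main() {
	t := 3
	a := make([]*big.Int, t+1)
	for i := range a {
		// the DKG only forces a[0] and a[t] to be non-zero; sampling every
		// coefficient non-zero keeps this sketch short.
		a[i] = nonZeroScalar()
	}
	fmt.Println(polynomialImage(a, 5)) // share handed to the participant at index 5
}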
randZr(&expo) + _ = randFr(&expo) // G1 generator multiplication b.Run("G1 gen", func(b *testing.B) { diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index ef4630a7341..e6c21004193 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -564,12 +564,18 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, } // Generate a polynomial P in Fr[X] of degree t a := make([]scalar, threshold+1) - randZrStar(&a[0]) // non-identity key + if err := randFrStar(&a[0]); err != nil { // non-identity key + return nil, nil, nil, fmt.Errorf("generating the random polynomial failed: %w", err) + } if threshold > 0 { for i := 1; i < threshold; i++ { - randZr(&a[i]) + if err := randFr(&a[i]); err != nil { + return nil, nil, nil, fmt.Errorf("generating the random polynomial failed: %w", err) + } + } + if err := randFrStar(&a[threshold]); err != nil { // enforce the polynomial degree + return nil, nil, nil, fmt.Errorf("generating the random polynomial failed: %w", err) } - randZrStar(&a[threshold]) // enforce the polynomial degree } // compute the shares for i := index(1); int(i) <= size; i++ { diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index d27f68ee45c..5db62e8672c 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -268,15 +268,21 @@ func (s *feldmanVSSstate) generateShares(seed []byte) error { s.vA = make([]pointG2, s.threshold+1) s.y = make([]pointG2, s.size) // non-zero a[0] - group private key is not zero - randZrStar(&s.a[0]) + if err := randFrStar(&s.a[0]); err != nil { + return fmt.Errorf("generating the polynomial failed: %w", err) + } generatorScalarMultG2(&s.vA[0], &s.a[0]) if s.threshold > 0 { for i := 1; i < s.threshold; i++ { - randZr(&s.a[i]) + if err := randFr(&s.a[i]); err != nil { + return fmt.Errorf("generating the polynomial failed: %w", err) + } generatorScalarMultG2(&s.vA[i], &s.a[i]) } // non-zero a[t] to enforce the polynomial degree - randZrStar(&s.a[s.threshold]) + if err := randFrStar(&s.a[s.threshold]); err != nil { + return fmt.Errorf("generating the polynomial failed: %w", err) + } generatorScalarMultG2(&s.vA[s.threshold], &s.a[s.threshold]) } From d7f3d5d5a531a773eadc2ceffc06145b6244c0be Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 3 Mar 2023 17:29:50 -0600 Subject: [PATCH 018/200] fix gcc compilation issue and remove blst.h --- crypto/bls12381_utils.c | 2 +- crypto/bls12381_utils.h | 2 +- crypto/bls_thresholdsign_core.c | 5 +- crypto/blst_include.h | 56 +++- crypto/blst_src/blst.h | 483 -------------------------------- crypto/blst_src/blst_aux.h | 102 ------- 6 files changed, 59 insertions(+), 591 deletions(-) delete mode 100644 crypto/blst_src/blst.h delete mode 100644 crypto/blst_src/blst_aux.h diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index dc8b642de66..45811478429 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -25,7 +25,7 @@ int get_Fr_BYTES() { // Fr utilities // Montgomery constant R related to the curve order r -const Fr BLS12_381_rR = (Fr){ /* (1<<256)%r */ +const limb_t BLS12_381_rR[Fr_LIMBS] = { /* (1<<256)%r */ TO_LIMB_T(0x1824b159acc5056f), TO_LIMB_T(0x998c4fefecbc4ff5), TO_LIMB_T(0x5884b7fa00034802), TO_LIMB_T(0x00000001fffffffe) }; diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 4939e15135e..9a874f6e9d3 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -92,7 +92,7 @@ int bls_spock_verify(const ep2_t, const byte*, const ep2_t, const byte*); void 
map_to_G1(ep_t, const byte*, const int); // Fr utilities -extern const Fr BLS12_381_rR; +extern const limb_t BLS12_381_rR[Fr_LIMBS]; bool_t Fr_is_zero(const Fr* a); bool_t Fr_is_equal(const Fr* a, const Fr* b); void Fr_set_limb(Fr*, const limb_t); diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index e6f94716d9b..75542763f6a 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -16,9 +16,8 @@ static void Fr_lagrangeCoefficientAtZero(Fr* res, const int i, const uint8_t ind Fr denominator; // eventually would represent D*R^k // Initialize N and D to Montgomery constant R - // TODO: hardcode R - Fr_copy(&numerator, &BLS12_381_rR); - Fr_copy(&denominator, &BLS12_381_rR); + Fr_copy(&numerator, (Fr*)BLS12_381_rR); + Fr_copy(&denominator, (Fr*)BLS12_381_rR); // sign of D: 0 for positive and 1 for negative int sign = 0; diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 77c06a9e5e5..7af94ea3b17 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -6,13 +6,67 @@ // extra tools to use BLST low level that are needed by the Flow crypto library // eventually this file would replace blst.h -#include "blst.h" // TODO: should be deleted +//#include "blst.h" // TODO: should be deleted #include "point.h" #include "consts.h" // types used by the Flow crypto library that are imported from BLST // these type definitions are used as an abstraction from BLST internal types +// Parts of this file have been copied from blst.h in the BLST repo +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifdef __SIZE_TYPE__ +typedef __SIZE_TYPE__ size_t; +#else +#include +#endif + +#if defined(__UINT8_TYPE__) && defined(__UINT32_TYPE__) \ + && defined(__UINT64_TYPE__) +typedef __UINT8_TYPE__ uint8_t; +typedef __UINT32_TYPE__ uint32_t; +typedef __UINT64_TYPE__ uint64_t; +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#elif defined(__BLST_CGO__) +typedef _Bool bool; /* it's assumed that cgo calls modern enough compiler */ +#elif defined(__STDC_VERSION__) && __STDC_VERSION__>=199901 +# define bool _Bool +#else +# define bool int +#endif + +#ifdef SWIG +# define DEFNULL =NULL +#elif defined __cplusplus +# define DEFNULL =0 +#else +# define DEFNULL +#endif + +typedef enum { + BLST_SUCCESS = 0, + BLST_BAD_ENCODING, + BLST_POINT_NOT_ON_CURVE, + BLST_POINT_NOT_IN_GROUP, + BLST_AGGR_TYPE_MISMATCH, + BLST_VERIFY_FAIL, + BLST_PK_IS_INFINITY, + BLST_BAD_SCALAR, +} BLST_ERROR; + +typedef uint8_t byte; +typedef uint64_t limb_t; + // field elements F_r // where `r` is the order of G1/G2. // F_r elements are represented as big numbers reduced modulo `r`. Big numbers diff --git a/crypto/blst_src/blst.h b/crypto/blst_src/blst.h deleted file mode 100644 index 24213ded2c5..00000000000 --- a/crypto/blst_src/blst.h +++ /dev/null @@ -1,483 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
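For context on Fr_lagrangeCoefficientAtZero in the bls_thresholdsign_core.c hunk above: outside Montgomery representation, the quantity being built is the Lagrange basis value at x = 0, l_i(0) = prod_{j != i} x_j / (x_j - x_i) (mod r), where the x_k are the signers' 1-based indices; initializing the numerator and denominator to the constant R only keeps the intermediate values in Montgomery form. Below is a math/big sketch of the plain, non-Montgomery computation; the names are illustrative and r is passed in by the caller.

// lagrangeCoefficientAtZero returns l_i(0) = prod_{j != i} x_j / (x_j - x_i) mod r
// for the signer at position i of `indices`; indices are assumed distinct.
func lagrangeCoefficientAtZero(i int, indices []int64, r *big.Int) *big.Int {
	num := big.NewInt(1)
	den := big.NewInt(1)
	xi := big.NewInt(indices[i])
	for j, xjRaw := range indices {
		if j == i {
			continue
		}
		xj := big.NewInt(xjRaw)
		num.Mod(num.Mul(num, xj), r)
		d := new(big.Int).Sub(xj, xi)
		den.Mod(den.Mul(den, d), r) // Mod maps negative values into [0, r)
	}
	den.ModInverse(den, r) // r is prime and den is non-zero, so the inverse exists
	return num.Mod(num.Mul(num, den), r)
}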
- * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef __BLST_H__ -#define __BLST_H__ - -#ifdef __SIZE_TYPE__ -typedef __SIZE_TYPE__ size_t; -#else -#include -#endif - -#if defined(__UINT8_TYPE__) && defined(__UINT32_TYPE__) \ - && defined(__UINT64_TYPE__) -typedef __UINT8_TYPE__ uint8_t; -typedef __UINT32_TYPE__ uint32_t; -typedef __UINT64_TYPE__ uint64_t; -#else -#include -#endif - -#ifdef __cplusplus -extern "C" { -#elif defined(__BLST_CGO__) -typedef _Bool bool; /* it's assumed that cgo calls modern enough compiler */ -#elif defined(__STDC_VERSION__) && __STDC_VERSION__>=199901 -# define bool _Bool -#else -# define bool int -#endif - -#ifdef SWIG -# define DEFNULL =NULL -#elif defined __cplusplus -# define DEFNULL =0 -#else -# define DEFNULL -#endif - -typedef enum { - BLST_SUCCESS = 0, - BLST_BAD_ENCODING, - BLST_POINT_NOT_ON_CURVE, - BLST_POINT_NOT_IN_GROUP, - BLST_AGGR_TYPE_MISMATCH, - BLST_VERIFY_FAIL, - BLST_PK_IS_INFINITY, - BLST_BAD_SCALAR, -} BLST_ERROR; - -typedef uint8_t byte; -typedef uint64_t limb_t; - -typedef struct { byte b[256/8]; } blst_scalar; -typedef struct { limb_t l[256/8/sizeof(limb_t)]; } blst_fr; -typedef struct { limb_t l[384/8/sizeof(limb_t)]; } blst_fp; -/* 0 is "real" part, 1 is "imaginary" */ -typedef struct { blst_fp fp[2]; } blst_fp2; -typedef struct { blst_fp2 fp2[3]; } blst_fp6; -typedef struct { blst_fp6 fp6[2]; } blst_fp12; - -void blst_scalar_from_uint32(blst_scalar *out, const uint32_t a[8]); -void blst_uint32_from_scalar(uint32_t out[8], const blst_scalar *a); -void blst_scalar_from_uint64(blst_scalar *out, const uint64_t a[4]); -void blst_uint64_from_scalar(uint64_t out[4], const blst_scalar *a); -void blst_scalar_from_bendian(blst_scalar *out, const byte a[32]); -void blst_bendian_from_scalar(byte out[32], const blst_scalar *a); -void blst_scalar_from_lendian(blst_scalar *out, const byte a[32]); -void blst_lendian_from_scalar(byte out[32], const blst_scalar *a); -bool blst_scalar_fr_check(const blst_scalar *a); -bool blst_sk_check(const blst_scalar *a); -bool blst_sk_add_n_check(blst_scalar *out, const blst_scalar *a, - const blst_scalar *b); -bool blst_sk_sub_n_check(blst_scalar *out, const blst_scalar *a, - const blst_scalar *b); -bool blst_sk_mul_n_check(blst_scalar *out, const blst_scalar *a, - const blst_scalar *b); -void blst_sk_inverse(blst_scalar *out, const blst_scalar *a); -bool blst_scalar_from_le_bytes(blst_scalar *out, const byte *in, size_t len); -bool blst_scalar_from_be_bytes(blst_scalar *out, const byte *in, size_t len); - -#ifndef SWIG -/* - * BLS12-381-specifc Fr operations. 
- */ -void blst_fr_add(blst_fr *ret, const blst_fr *a, const blst_fr *b); -void blst_fr_sub(blst_fr *ret, const blst_fr *a, const blst_fr *b); -void blst_fr_mul_by_3(blst_fr *ret, const blst_fr *a); -void blst_fr_lshift(blst_fr *ret, const blst_fr *a, size_t count); -void blst_fr_rshift(blst_fr *ret, const blst_fr *a, size_t count); -void blst_fr_mul(blst_fr *ret, const blst_fr *a, const blst_fr *b); -void blst_fr_sqr(blst_fr *ret, const blst_fr *a); -void blst_fr_cneg(blst_fr *ret, const blst_fr *a, bool flag); -void blst_fr_eucl_inverse(blst_fr *ret, const blst_fr *a); -void blst_fr_inverse(blst_fr *ret, const blst_fr *a); -#ifdef BLST_FR_PENTAROOT -void blst_fr_pentaroot(blst_fr *ret, const blst_fr *a); -void blst_fr_pentapow(blst_fr *ret, const blst_fr *a); -#endif - -void blst_fr_from_uint64(blst_fr *ret, const uint64_t a[4]); -void blst_uint64_from_fr(uint64_t ret[4], const blst_fr *a); -void blst_fr_from_scalar(blst_fr *ret, const blst_scalar *a); -void blst_scalar_from_fr(blst_scalar *ret, const blst_fr *a); - -/* - * BLS12-381-specifc Fp operations. - */ -void blst_fp_add(blst_fp *ret, const blst_fp *a, const blst_fp *b); -void blst_fp_sub(blst_fp *ret, const blst_fp *a, const blst_fp *b); -void blst_fp_mul_by_3(blst_fp *ret, const blst_fp *a); -void blst_fp_mul_by_8(blst_fp *ret, const blst_fp *a); -void blst_fp_lshift(blst_fp *ret, const blst_fp *a, size_t count); -void blst_fp_mul(blst_fp *ret, const blst_fp *a, const blst_fp *b); -void blst_fp_sqr(blst_fp *ret, const blst_fp *a); -void blst_fp_cneg(blst_fp *ret, const blst_fp *a, bool flag); -void blst_fp_eucl_inverse(blst_fp *ret, const blst_fp *a); -void blst_fp_inverse(blst_fp *ret, const blst_fp *a); -bool blst_fp_sqrt(blst_fp *ret, const blst_fp *a); - -void blst_fp_from_uint32(blst_fp *ret, const uint32_t a[12]); -void blst_uint32_from_fp(uint32_t ret[12], const blst_fp *a); -void blst_fp_from_uint64(blst_fp *ret, const uint64_t a[6]); -void blst_uint64_from_fp(uint64_t ret[6], const blst_fp *a); -void blst_fp_from_bendian(blst_fp *ret, const byte a[48]); -void blst_bendian_from_fp(byte ret[48], const blst_fp *a); -void blst_fp_from_lendian(blst_fp *ret, const byte a[48]); -void blst_lendian_from_fp(byte ret[48], const blst_fp *a); - -/* - * BLS12-381-specifc Fp2 operations. - */ -void blst_fp2_add(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); -void blst_fp2_sub(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); -void blst_fp2_mul_by_3(blst_fp2 *ret, const blst_fp2 *a); -void blst_fp2_mul_by_8(blst_fp2 *ret, const blst_fp2 *a); -void blst_fp2_lshift(blst_fp2 *ret, const blst_fp2 *a, size_t count); -void blst_fp2_mul(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); -void blst_fp2_sqr(blst_fp2 *ret, const blst_fp2 *a); -void blst_fp2_cneg(blst_fp2 *ret, const blst_fp2 *a, bool flag); -void blst_fp2_eucl_inverse(blst_fp2 *ret, const blst_fp2 *a); -void blst_fp2_inverse(blst_fp2 *ret, const blst_fp2 *a); -bool blst_fp2_sqrt(blst_fp2 *ret, const blst_fp2 *a); - -/* - * BLS12-381-specifc Fp12 operations. - */ -void blst_fp12_sqr(blst_fp12 *ret, const blst_fp12 *a); -void blst_fp12_cyclotomic_sqr(blst_fp12 *ret, const blst_fp12 *a); -void blst_fp12_mul(blst_fp12 *ret, const blst_fp12 *a, const blst_fp12 *b); -void blst_fp12_mul_by_xy00z0(blst_fp12 *ret, const blst_fp12 *a, - const blst_fp6 *xy00z0); -void blst_fp12_conjugate(blst_fp12 *a); -void blst_fp12_inverse(blst_fp12 *ret, const blst_fp12 *a); -/* caveat lector! |n| has to be non-zero and not more than 3! 
*/ -void blst_fp12_frobenius_map(blst_fp12 *ret, const blst_fp12 *a, size_t n); -bool blst_fp12_is_equal(const blst_fp12 *a, const blst_fp12 *b); -bool blst_fp12_is_one(const blst_fp12 *a); -bool blst_fp12_in_group(const blst_fp12 *a); -const blst_fp12 *blst_fp12_one(); -#endif // SWIG - -/* - * BLS12-381-specifc point operations. - */ -typedef struct { blst_fp x, y, z; } blst_p1; -typedef struct { blst_fp x, y; } blst_p1_affine; - -void blst_p1_add(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); -void blst_p1_add_or_double(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); -void blst_p1_add_affine(blst_p1 *out, const blst_p1 *a, - const blst_p1_affine *b); -void blst_p1_add_or_double_affine(blst_p1 *out, const blst_p1 *a, - const blst_p1_affine *b); -void blst_p1_double(blst_p1 *out, const blst_p1 *a); -void blst_p1_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar, - size_t nbits); -void blst_p1_cneg(blst_p1 *p, bool cbit); -void blst_p1_to_affine(blst_p1_affine *out, const blst_p1 *in); -void blst_p1_from_affine(blst_p1 *out, const blst_p1_affine *in); -bool blst_p1_on_curve(const blst_p1 *p); -bool blst_p1_in_g1(const blst_p1 *p); -bool blst_p1_is_equal(const blst_p1 *a, const blst_p1 *b); -bool blst_p1_is_inf(const blst_p1 *a); -const blst_p1 *blst_p1_generator(); - -bool blst_p1_affine_on_curve(const blst_p1_affine *p); -bool blst_p1_affine_in_g1(const blst_p1_affine *p); -bool blst_p1_affine_is_equal(const blst_p1_affine *a, const blst_p1_affine *b); -bool blst_p1_affine_is_inf(const blst_p1_affine *a); -const blst_p1_affine *blst_p1_affine_generator(); - -typedef struct { blst_fp2 x, y, z; } blst_p2; -typedef struct { blst_fp2 x, y; } blst_p2_affine; - -void blst_p2_add(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); -void blst_p2_add_or_double(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); -void blst_p2_add_affine(blst_p2 *out, const blst_p2 *a, - const blst_p2_affine *b); -void blst_p2_add_or_double_affine(blst_p2 *out, const blst_p2 *a, - const blst_p2_affine *b); -void blst_p2_double(blst_p2 *out, const blst_p2 *a); -void blst_p2_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar, - size_t nbits); -void blst_p2_cneg(blst_p2 *p, bool cbit); -void blst_p2_to_affine(blst_p2_affine *out, const blst_p2 *in); -void blst_p2_from_affine(blst_p2 *out, const blst_p2_affine *in); -bool blst_p2_on_curve(const blst_p2 *p); -bool blst_p2_in_g2(const blst_p2 *p); -bool blst_p2_is_equal(const blst_p2 *a, const blst_p2 *b); -bool blst_p2_is_inf(const blst_p2 *a); -const blst_p2 *blst_p2_generator(); - -bool blst_p2_affine_on_curve(const blst_p2_affine *p); -bool blst_p2_affine_in_g2(const blst_p2_affine *p); -bool blst_p2_affine_is_equal(const blst_p2_affine *a, const blst_p2_affine *b); -bool blst_p2_affine_is_inf(const blst_p2_affine *a); -const blst_p2_affine *blst_p2_affine_generator(); - -/* - * Multi-scalar multiplications and other multi-point operations. 
- */ - -void blst_p1s_to_affine(blst_p1_affine dst[], const blst_p1 *const points[], - size_t npoints); -void blst_p1s_add(blst_p1 *ret, const blst_p1_affine *const points[], - size_t npoints); - -size_t blst_p1s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints); -void blst_p1s_mult_wbits_precompute(blst_p1_affine table[], size_t wbits, - const blst_p1_affine *const points[], - size_t npoints); -size_t blst_p1s_mult_wbits_scratch_sizeof(size_t npoints); -void blst_p1s_mult_wbits(blst_p1 *ret, const blst_p1_affine table[], - size_t wbits, size_t npoints, - const byte *const scalars[], size_t nbits, - limb_t *scratch); - -size_t blst_p1s_mult_pippenger_scratch_sizeof(size_t npoints); -void blst_p1s_mult_pippenger(blst_p1 *ret, const blst_p1_affine *const points[], - size_t npoints, const byte *const scalars[], - size_t nbits, limb_t *scratch); -void blst_p1s_tile_pippenger(blst_p1 *ret, const blst_p1_affine *const points[], - size_t npoints, const byte *const scalars[], - size_t nbits, limb_t *scratch, - size_t bit0, size_t window); - -void blst_p2s_to_affine(blst_p2_affine dst[], const blst_p2 *const points[], - size_t npoints); -void blst_p2s_add(blst_p2 *ret, const blst_p2_affine *const points[], - size_t npoints); - -size_t blst_p2s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints); -void blst_p2s_mult_wbits_precompute(blst_p2_affine table[], size_t wbits, - const blst_p2_affine *const points[], - size_t npoints); -size_t blst_p2s_mult_wbits_scratch_sizeof(size_t npoints); -void blst_p2s_mult_wbits(blst_p2 *ret, const blst_p2_affine table[], - size_t wbits, size_t npoints, - const byte *const scalars[], size_t nbits, - limb_t *scratch); - -size_t blst_p2s_mult_pippenger_scratch_sizeof(size_t npoints); -void blst_p2s_mult_pippenger(blst_p2 *ret, const blst_p2_affine *const points[], - size_t npoints, const byte *const scalars[], - size_t nbits, limb_t *scratch); -void blst_p2s_tile_pippenger(blst_p2 *ret, const blst_p2_affine *const points[], - size_t npoints, const byte *const scalars[], - size_t nbits, limb_t *scratch, - size_t bit0, size_t window); - -/* - * Hash-to-curve operations. - */ -#ifndef SWIG -void blst_map_to_g1(blst_p1 *out, const blst_fp *u, const blst_fp *v DEFNULL); -void blst_map_to_g2(blst_p2 *out, const blst_fp2 *u, const blst_fp2 *v DEFNULL); -#endif - -void blst_encode_to_g1(blst_p1 *out, - const byte *msg, size_t msg_len, - const byte *DST DEFNULL, size_t DST_len DEFNULL, - const byte *aug DEFNULL, size_t aug_len DEFNULL); -void blst_hash_to_g1(blst_p1 *out, - const byte *msg, size_t msg_len, - const byte *DST DEFNULL, size_t DST_len DEFNULL, - const byte *aug DEFNULL, size_t aug_len DEFNULL); - -void blst_encode_to_g2(blst_p2 *out, - const byte *msg, size_t msg_len, - const byte *DST DEFNULL, size_t DST_len DEFNULL, - const byte *aug DEFNULL, size_t aug_len DEFNULL); -void blst_hash_to_g2(blst_p2 *out, - const byte *msg, size_t msg_len, - const byte *DST DEFNULL, size_t DST_len DEFNULL, - const byte *aug DEFNULL, size_t aug_len DEFNULL); - -/* - * Zcash-compatible serialization/deserialization. 
- */ -void blst_p1_serialize(byte out[96], const blst_p1 *in); -void blst_p1_compress(byte out[48], const blst_p1 *in); -void blst_p1_affine_serialize(byte out[96], const blst_p1_affine *in); -void blst_p1_affine_compress(byte out[48], const blst_p1_affine *in); -BLST_ERROR blst_p1_uncompress(blst_p1_affine *out, const byte in[48]); -BLST_ERROR blst_p1_deserialize(blst_p1_affine *out, const byte in[96]); - -void blst_p2_serialize(byte out[192], const blst_p2 *in); -void blst_p2_compress(byte out[96], const blst_p2 *in); -void blst_p2_affine_serialize(byte out[192], const blst_p2_affine *in); -void blst_p2_affine_compress(byte out[96], const blst_p2_affine *in); -BLST_ERROR blst_p2_uncompress(blst_p2_affine *out, const byte in[96]); -BLST_ERROR blst_p2_deserialize(blst_p2_affine *out, const byte in[192]); - -/* - * Specification defines two variants, 'minimal-signature-size' and - * 'minimal-pubkey-size'. To unify appearance we choose to distinguish - * them by suffix referring to the public key type, more specifically - * _pk_in_g1 corresponds to 'minimal-pubkey-size' and _pk_in_g2 - to - * 'minimal-signature-size'. It might appear a bit counterintuitive - * in sign call, but no matter how you twist it, something is bound to - * turn a little odd. - */ -/* - * Secret-key operations. - */ -void blst_keygen(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, - const byte *info DEFNULL, size_t info_len DEFNULL); -void blst_sk_to_pk_in_g1(blst_p1 *out_pk, const blst_scalar *SK); -void blst_sign_pk_in_g1(blst_p2 *out_sig, const blst_p2 *hash, - const blst_scalar *SK); -void blst_sk_to_pk_in_g2(blst_p2 *out_pk, const blst_scalar *SK); -void blst_sign_pk_in_g2(blst_p1 *out_sig, const blst_p1 *hash, - const blst_scalar *SK); - -/* - * Pairing interface. - */ -#ifndef SWIG -void blst_miller_loop(blst_fp12 *ret, const blst_p2_affine *Q, - const blst_p1_affine *P); -void blst_final_exp(blst_fp12 *ret, const blst_fp12 *f); -void blst_precompute_lines(blst_fp6 Qlines[68], const blst_p2_affine *Q); -void blst_miller_loop_lines(blst_fp12 *ret, const blst_fp6 Qlines[68], - const blst_p1_affine *P); -bool blst_fp12_finalverify(const blst_fp12 *gt1, const blst_fp12 *gt2); -#endif - -#ifdef __BLST_CGO__ -typedef limb_t blst_pairing; -#elif defined(__BLST_RUST_BINDGEN__) -typedef struct {} blst_pairing; -#else -typedef struct blst_opaque blst_pairing; -#endif - -size_t blst_pairing_sizeof(); -void blst_pairing_init(blst_pairing *new_ctx, bool hash_or_encode, - const byte *DST DEFNULL, size_t DST_len DEFNULL); -const byte *blst_pairing_get_dst(const blst_pairing *ctx); -void blst_pairing_commit(blst_pairing *ctx); -BLST_ERROR blst_pairing_aggregate_pk_in_g2(blst_pairing *ctx, - const blst_p2_affine *PK, - const blst_p1_affine *signature, - const byte *msg, size_t msg_len, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g2(blst_pairing *ctx, - const blst_p2_affine *PK, - bool pk_grpchk, - const blst_p1_affine *signature, - bool sig_grpchk, - const byte *msg, size_t msg_len, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g2(blst_pairing *ctx, - const blst_p2_affine *PK, - const blst_p1_affine *sig, - const byte *scalar, - size_t nbits, - const byte *msg, - size_t msg_len, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g2(blst_pairing *ctx, - const blst_p2_affine *PK, - bool pk_grpchk, - const blst_p1_affine *sig, - bool sig_grpchk, - const byte *scalar, - 
size_t nbits, - const byte *msg, - size_t msg_len, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_pairing_aggregate_pk_in_g1(blst_pairing *ctx, - const blst_p1_affine *PK, - const blst_p2_affine *signature, - const byte *msg, size_t msg_len, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g1(blst_pairing *ctx, - const blst_p1_affine *PK, - bool pk_grpchk, - const blst_p2_affine *signature, - bool sig_grpchk, - const byte *msg, size_t msg_len, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g1(blst_pairing *ctx, - const blst_p1_affine *PK, - const blst_p2_affine *sig, - const byte *scalar, - size_t nbits, - const byte *msg, - size_t msg_len, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g1(blst_pairing *ctx, - const blst_p1_affine *PK, - bool pk_grpchk, - const blst_p2_affine *sig, - bool sig_grpchk, - const byte *scalar, - size_t nbits, - const byte *msg, - size_t msg_len, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_pairing_merge(blst_pairing *ctx, const blst_pairing *ctx1); -bool blst_pairing_finalverify(const blst_pairing *ctx, - const blst_fp12 *gtsig DEFNULL); - - -/* - * Customarily applications aggregate signatures separately. - * In which case application would have to pass NULLs for |signature| - * to blst_pairing_aggregate calls and pass aggregated signature - * collected with these calls to blst_pairing_finalverify. Inputs are - * Zcash-compatible "straight-from-wire" byte vectors, compressed or - * not. - */ -BLST_ERROR blst_aggregate_in_g1(blst_p1 *out, const blst_p1 *in, - const byte *zwire); -BLST_ERROR blst_aggregate_in_g2(blst_p2 *out, const blst_p2 *in, - const byte *zwire); - -void blst_aggregated_in_g1(blst_fp12 *out, const blst_p1_affine *signature); -void blst_aggregated_in_g2(blst_fp12 *out, const blst_p2_affine *signature); - -/* - * "One-shot" CoreVerify entry points. - */ -BLST_ERROR blst_core_verify_pk_in_g1(const blst_p1_affine *pk, - const blst_p2_affine *signature, - bool hash_or_encode, - const byte *msg, size_t msg_len, - const byte *DST DEFNULL, - size_t DST_len DEFNULL, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_core_verify_pk_in_g2(const blst_p2_affine *pk, - const blst_p1_affine *signature, - bool hash_or_encode, - const byte *msg, size_t msg_len, - const byte *DST DEFNULL, - size_t DST_len DEFNULL, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); - -extern const blst_p1_affine BLS12_381_G1; -extern const blst_p1_affine BLS12_381_NEG_G1; -extern const blst_p2_affine BLS12_381_G2; -extern const blst_p2_affine BLS12_381_NEG_G2; - -#include "blst_aux.h" - -#ifdef __cplusplus -} -#endif -#endif diff --git a/crypto/blst_src/blst_aux.h b/crypto/blst_src/blst_aux.h deleted file mode 100644 index 6d444fc1729..00000000000 --- a/crypto/blst_src/blst_aux.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef __BLST_AUX_H__ -#define __BLST_AUX_H__ -/* - * This file lists interfaces that might be promoted to blst.h or removed, - * depending on their proven/unproven worthiness. 
- */ - -void blst_fr_to(blst_fr *ret, const blst_fr *a); -void blst_fr_from(blst_fr *ret, const blst_fr *a); - -void blst_fp_to(blst_fp *ret, const blst_fp *a); -void blst_fp_from(blst_fp *ret, const blst_fp *a); - -bool blst_fp_is_square(const blst_fp *a); -bool blst_fp2_is_square(const blst_fp2 *a); - -void blst_p1_from_jacobian(blst_p1 *out, const blst_p1 *in); -void blst_p2_from_jacobian(blst_p2 *out, const blst_p2 *in); - -/* - * Below functions produce both point and deserialized outcome of - * SkToPk and Sign. However, deserialized outputs are pre-decorated - * with sign and infinity bits. This means that you have to bring the - * output into compliance prior returning to application. If you want - * compressed point value, then do [equivalent of] - * - * byte temp[96]; - * blst_sk_to_pk2_in_g1(temp, out_pk, SK); - * temp[0] |= 0x80; - * memcpy(out, temp, 48); - * - * Otherwise do - * - * blst_sk_to_pk2_in_g1(out, out_pk, SK); - * out[0] &= ~0x20; - * - * Either |out| or |out_| can be NULL. - */ -void blst_sk_to_pk2_in_g1(byte out[96], blst_p1_affine *out_pk, - const blst_scalar *SK); -void blst_sign_pk2_in_g1(byte out[192], blst_p2_affine *out_sig, - const blst_p2 *hash, const blst_scalar *SK); -void blst_sk_to_pk2_in_g2(byte out[192], blst_p2_affine *out_pk, - const blst_scalar *SK); -void blst_sign_pk2_in_g2(byte out[96], blst_p1_affine *out_sig, - const blst_p1 *hash, const blst_scalar *SK); - -typedef struct {} blst_uniq; - -size_t blst_uniq_sizeof(size_t n_nodes); -void blst_uniq_init(blst_uniq *tree); -bool blst_uniq_test(blst_uniq *tree, const byte *msg, size_t len); - -#ifdef expand_message_xmd -void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, - const unsigned char *aug, size_t aug_len, - const unsigned char *msg, size_t msg_len, - const unsigned char *DST, size_t DST_len); -#else -void blst_expand_message_xmd(byte *out, size_t out_len, - const byte *msg, size_t msg_len, - const byte *DST, size_t DST_len); -#endif - -void blst_p1_unchecked_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar, - size_t nbits); -void blst_p2_unchecked_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar, - size_t nbits); - -void blst_pairing_raw_aggregate(blst_pairing *ctx, const blst_p2_affine *q, - const blst_p1_affine *p); -blst_fp12 *blst_pairing_as_fp12(blst_pairing *ctx); -void blst_bendian_from_fp12(byte out[48*12], const blst_fp12 *a); - -void blst_keygen_v3(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, - const byte *info DEFNULL, size_t info_len DEFNULL); -void blst_keygen_v4_5(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, - const byte *salt, size_t salt_len, - const byte *info DEFNULL, size_t info_len DEFNULL); -void blst_keygen_v5(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, - const byte *salt, size_t salt_len, - const byte *info DEFNULL, size_t info_len DEFNULL); -void blst_derive_master_eip2333(blst_scalar *out_SK, - const byte *IKM, size_t IKM_len); -void blst_derive_child_eip2333(blst_scalar *out_SK, const blst_scalar *SK, - uint32_t child_index); - -void blst_scalar_from_hexascii(blst_scalar *out, const byte *hex); -void blst_fr_from_hexascii(blst_fr *ret, const byte *hex); -void blst_fp_from_hexascii(blst_fp *ret, const byte *hex); - -size_t blst_p1_sizeof(); -size_t blst_p1_affine_sizeof(); -size_t blst_p2_sizeof(); -size_t blst_p2_affine_sizeof(); -size_t blst_fp12_sizeof(); -#endif From 4979760110e2f23e82fbd6437677b9e6a1c1c4fa Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 3 Mar 2023 17:44:09 -0600 Subject: 
[PATCH 019/200] fix double definition --- crypto/blst_include.h | 1 - 1 file changed, 1 deletion(-) diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 7af94ea3b17..9052964f361 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -65,7 +65,6 @@ typedef enum { } BLST_ERROR; typedef uint8_t byte; -typedef uint64_t limb_t; // field elements F_r // where `r` is the order of G1/G2. From 90c412fd01d765ce006eeb350ad1e64400b8ab8e Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Sun, 5 Mar 2023 20:54:50 -0600 Subject: [PATCH 020/200] enable tmate on ci temporarily --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9b977950c97..e360717d043 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -115,6 +115,8 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true + - name: Setup tmate session + uses: mxschmitt/action-tmate@v3 - name: Run tests (${{ matrix.targets.name }}) if: github.actor != 'bors[bot]' uses: nick-fields/retry@v2 From d96d6fa6b9b7354a72e55eaf7e16e2cacc0a1931 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Sun, 5 Mar 2023 23:47:33 -0600 Subject: [PATCH 021/200] test improvement and temporary memory free --- crypto/bls12381_utils.c | 9 ++++----- crypto/bls_test.go | 22 +++++++++++----------- crypto/dkg_core.c | 2 ++ 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 45811478429..10531c09602 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -30,11 +30,6 @@ const limb_t BLS12_381_rR[Fr_LIMBS] = { /* (1<<256)%r */ TO_LIMB_T(0x5884b7fa00034802), TO_LIMB_T(0x00000001fffffffe) }; -/*0x1824b159acc5056f -0x998c4fefecbc4ff5 -0x5884b7fa00034802 -0x00000001fffffffe*/ - // TODO: temp utility function to delete bn_st* Fr_blst_to_relic(const Fr* x) { bn_st* out = (bn_st*)malloc(sizeof(bn_st)); @@ -366,6 +361,7 @@ void ep_mult(ep_t res, const ep_t p, const Fr *expo) { bn_st* tmp_expo = Fr_blst_to_relic(expo); // Using window NAF of size 2 ep_mul_lwnaf(res, p, tmp_expo); + free(tmp_expo); } // Exponentiation of generator g1 in G1 @@ -374,6 +370,7 @@ void ep_mult_gen_bench(ep_t res, const Fr* expo) { bn_st* tmp_expo = Fr_blst_to_relic(expo); // Using precomputed table of size 4 ep_mul_gen(res, tmp_expo); + free(tmp_expo); } void ep_mult_generic_bench(ep_t res, const Fr* expo) { @@ -386,6 +383,7 @@ void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo) { bn_st* tmp_expo = Fr_blst_to_relic(expo); // Using window NAF of size 2 ep2_mul_lwnaf(res, p, tmp_expo); + free(tmp_expo); } // Exponentiation of generator g2 in G2 @@ -393,6 +391,7 @@ void ep2_mult_gen(ep2_t res, const Fr* expo) { bn_st* tmp_expo = Fr_blst_to_relic(expo); // Using precomputed table of size 4 g2_mul_gen(res, tmp_expo); + free(tmp_expo); } // DEBUG printing functions diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 8aec95a8b03..a9672b8eeb7 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -7,11 +7,9 @@ import ( "crypto/rand" "encoding/hex" "fmt" - _ "math/rand" mrand "math/rand" "testing" "time" - _ "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -69,7 +67,7 @@ func BenchmarkBLSBLS12381Verify(b *testing.B) { // utility function to generate a random BLS private key func randomSK(t *testing.T, seed []byte) PrivateKey { - n, err := rand.Read(seed) + n, err := mrand.Read(seed) require.Equal(t, n, KeyGenSeedMinLen) require.NoError(t, err) sk, err := 
GeneratePrivateKey(BLSBLS12381, seed) @@ -270,23 +268,23 @@ func TestBLSPOP(t *testing.T) { } // BLS multi-signature -// signature aggregation sanity check +// signature aggregation with the same message sanity check // // Aggregate n signatures of the same message under different keys, and compare // it against the signature of the message under an aggregated private key. // Verify the aggregated signature using the multi-signature verification with // one message. -func TestBLSAggregateSignatures(t *testing.T) { +func TestBLSAggregateSignaturesSameMessage(t *testing.T) { + r := time.Now().UnixNano() + mrand.Seed(r) + t.Logf("math rand seed is %d", r) // random message input := make([]byte, 100) - _, err := rand.Read(input) + _, err := mrand.Read(input) require.NoError(t, err) // hasher kmac := NewExpandMsgXOFKMAC128("test tag") // number of signatures to aggregate - r := time.Now().UnixNano() - mrand.Seed(r) - t.Logf("math rand seed is %d", r) sigsNum := mrand.Intn(100) + 1 sigs := make([]Signature, 0, sigsNum) sks := make([]PrivateKey, 0, sigsNum) @@ -330,19 +328,21 @@ func TestBLSAggregateSignatures(t *testing.T) { t.Run("one invalid signature", func(t *testing.T) { input[0] ^= 1 randomIndex := mrand.Intn(sigsNum) - sigs[randomIndex], err = sks[randomIndex].Sign(input, kmac) + sigs[randomIndex], err = sks[randomIndex].Sign(input, kmac) // sign a different message input[0] ^= 1 aggSig, err = AggregateBLSSignatures(sigs) require.NoError(t, err) + // First check: check the signatures are not equal assert.NotEqual(t, aggSig, expectedSig, "signature %s shouldn't be %s private keys are %s, input is %x", aggSig, expectedSig, sks, input) + // Second check: multi-verification should fail valid, err := VerifyBLSSignatureOneMessage(pks, aggSig, input, kmac) require.NoError(t, err) assert.False(t, valid, "verification of signature %s should fail, it shouldn't be %s private keys are %s, input is %x", aggSig, expectedSig, sks, input) - sigs[randomIndex], err = sks[randomIndex].Sign(input, kmac) + sigs[randomIndex], err = sks[randomIndex].Sign(input, kmac) // rebuild the correct signature require.NoError(t, err) }) diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 34d6addbffb..9ca0e7a821e 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -36,6 +36,7 @@ void Fr_polynomialImage(Fr* image, ep2_t y, const Fr* a, const int a_size, const if (y) { bn_st* tmp = Fr_blst_to_relic(image); g2_mul_gen(y, tmp); + free(tmp); } } @@ -102,6 +103,7 @@ int verifyshare(const Fr* x, const ep2_t y) { ep2_new(res); bn_st* x_tmp = Fr_blst_to_relic(x); g2_mul_gen(res, x_tmp); + free(x_tmp); return (ep2_cmp(res, (ep2_st*)y) == RLC_EQ); } From 0593da596b5e67e61a2907af93fd34678f993ffb Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 6 Mar 2023 08:46:20 -0600 Subject: [PATCH 022/200] fix memory allocation bug in temp function --- crypto/bls12381_utils.c | 1 + crypto/bls_thresholdsign_test.go | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 10531c09602..c4f2d10b632 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -35,6 +35,7 @@ bn_st* Fr_blst_to_relic(const Fr* x) { bn_st* out = (bn_st*)malloc(sizeof(bn_st)); byte* data = (byte*)malloc(Fr_BYTES); be_bytes_from_limbs(data, (limb_t*)x, Fr_BYTES); + out->alloc = RLC_DV_DIGS; bn_read_bin(out, data, Fr_BYTES); free(data); return out; diff --git a/crypto/bls_thresholdsign_test.go b/crypto/bls_thresholdsign_test.go index 5473b454827..0d7f7204a79 100644 --- 
a/crypto/bls_thresholdsign_test.go +++ b/crypto/bls_thresholdsign_test.go @@ -31,10 +31,12 @@ var thresholdSignatureMessage = []byte("random message") // centralized test of the stateful threshold signature using the threshold key generation. func testCentralizedStatefulAPI(t *testing.T) { + r := time.Now().UnixNano() + mrand.Seed(r) + t.Log(r) n := 10 for threshold := MinimumThreshold; threshold < n; threshold++ { // generate threshold keys - mrand.Seed(time.Now().UnixNano()) seed := make([]byte, SeedMinLenDKG) _, err := mrand.Read(seed) require.NoError(t, err) From 0e8829e7c8194fd89b344318dcb9f0080b486b97 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 6 Mar 2023 09:22:06 -0600 Subject: [PATCH 023/200] Revert "enable tmate on ci temporarily" This reverts commit 90c412fd01d765ce006eeb350ad1e64400b8ab8e. --- .github/workflows/ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e360717d043..9b977950c97 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -115,8 +115,6 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Setup tmate session - uses: mxschmitt/action-tmate@v3 - name: Run tests (${{ matrix.targets.name }}) if: github.actor != 'bors[bot]' uses: nick-fields/retry@v2 From 52cae3e0af4635ad17ed8edd4a86464db9ede606 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 8 Mar 2023 16:30:28 -0600 Subject: [PATCH 024/200] g1 and g2 exportable types to cgo --- crypto/bls12381_utils.c | 13 ++++++++----- crypto/bls12381_utils.go | 4 ++-- crypto/bls12381_utils.h | 2 +- crypto/blst_include.h | 40 ++++++++++++++++++++++++++++++++-------- 4 files changed, 43 insertions(+), 16 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index c4f2d10b632..f1a93173971 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -182,6 +182,10 @@ static void pow256_from_be_bytes(pow256 ret, const unsigned char a[Fr_BYTES]) } } +static void pow256_from_Fr(pow256 ret, const Fr* in) { + le_bytes_from_limbs(ret, (limb_t*)in, Fr_BYTES); +} + // reads a scalar in `a` and checks it is a valid Fr element (a < r). // input bytes are big endian. 
// returns: @@ -388,11 +392,10 @@ void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo) { } // Exponentiation of generator g2 in G2 -void ep2_mult_gen(ep2_t res, const Fr* expo) { - bn_st* tmp_expo = Fr_blst_to_relic(expo); - // Using precomputed table of size 4 - g2_mul_gen(res, tmp_expo); - free(tmp_expo); +void G2_mult_gen(G2* res, const Fr* expo) { + pow256 tmp; + pow256_from_Fr(tmp, expo); + POINTonE2_sign(res, &BLS12_381_G2, tmp); } // DEBUG printing functions diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index e2ee855e081..6f093d57812 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -21,7 +21,7 @@ import ( // Go wrappers around BLST C types // Go wrappers around Relic C types type pointG1 C.ep_st -type pointG2 C.ep2_st +type pointG2 C.G2 type scalar C.Fr // BLS12-381 related lengths @@ -96,7 +96,7 @@ func genericScalarMultG1(res *pointG1, expo *scalar) { // Exponentiation of g2 in G2 func generatorScalarMultG2(res *pointG2, expo *scalar) { - C.ep2_mult_gen((*C.ep2_st)(res), (*C.Fr)(expo)) + C.G2_mult_gen((*C.G2)(res), (*C.Fr)(expo)) } // comparison in Fr where r is the group order of G1/G2 diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 9a874f6e9d3..e0c5fed472c 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -131,7 +131,7 @@ void ep2_write_bin_compact(byte *, const ep2_t, const int); void ep_mult_gen_bench(ep_t, const Fr*); void ep_mult_generic_bench(ep_t, const Fr*); void ep_mult(ep_t, const ep_t, const Fr*); -void ep2_mult_gen(ep2_t, const Fr*); +void G2_mult_gen(ep2_t, const Fr*); void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo); void ep_sum_vector(ep_t, ep_st*, const int); diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 9052964f361..0ee8e99ddb2 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -6,9 +6,9 @@ // extra tools to use BLST low level that are needed by the Flow crypto library // eventually this file would replace blst.h -//#include "blst.h" // TODO: should be deleted #include "point.h" #include "consts.h" +#include "bls12381_utils.h" // types used by the Flow crypto library that are imported from BLST // these type definitions are used as an abstraction from BLST internal types @@ -35,6 +35,8 @@ typedef __UINT64_TYPE__ uint64_t; #include #endif +typedef uint8_t byte; + #ifdef __cplusplus extern "C" { #elif defined(__BLST_CGO__) @@ -64,17 +66,39 @@ typedef enum { BLST_BAD_SCALAR, } BLST_ERROR; -typedef uint8_t byte; - // field elements F_r // where `r` is the order of G1/G2. // F_r elements are represented as big numbers reduced modulo `r`. Big numbers // are represented as a little endian vector of limbs. -// `Fr` is equivalent to type vec256 (used internally by BLST for F_r elements). -typedef struct {limb_t limbs[4];} Fr; +// `Fr` is equivalent to type `vec256` (used internally by BLST for F_r elements). +// `Fr` is defined as a struct to be exportable through cgo to the Go layer. +typedef struct {limb_t limbs[Fr_LIMBS];} Fr; + +// field elements F_p +// F_p elements are represented as big numbers reduced modulo `p`. Big numbers +// are represented as a little endian vector of limbs. +// `Fp` is equivalent to type `vec384` (used internally by BLST for F_p elements). +// `Fp` does not need to be exported to cgo. +typedef vec384 Fp; + // Subroup G1 in E1 -typedef POINTonE1 G1; -// Subroup G1 in E2 -typedef POINTonE2 G2; +// G1 points are represented in Jacobian coordinates (x,y,z), +// where x, y, x are elements of F_p (type `Fp`). 
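The comments above describe G1 and G2 points in Jacobian coordinates (X, Y, Z). For readers unfamiliar with that representation: the affine point is recovered as x = X/Z^2 and y = Y/Z^3 modulo the field prime, which is what lets additions and doublings avoid a field inversion per operation. A small math/big sketch of the coordinate map only, under the assumption that Z is non-zero; any odd prime works for p here and the names are illustrative.

// jacobianToAffine maps (X, Y, Z) to (X/Z^2, Y/Z^3) mod p.
func jacobianToAffine(X, Y, Z, p *big.Int) (x, y *big.Int) {
	zInv := new(big.Int).ModInverse(Z, p) // Z must be non-zero mod p
	zInv2 := new(big.Int).Mod(new(big.Int).Mul(zInv, zInv), p)
	zInv3 := new(big.Int).Mod(new(big.Int).Mul(zInv2, zInv), p)
	x = new(big.Int).Mod(new(big.Int).Mul(X, zInv2), p)
	y = new(big.Int).Mod(new(big.Int).Mul(Y, zInv3), p)
	return x, y
}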
+// `G1` is equivelent to type `POINTonE1` (used internally by BLST for Jacobian E1 elements) +// `G1` is defined as a struct to be exportable through cgo to the Go layer. +typedef struct {Fp x,y,z} G1; + +// field elements F_p^2 +// F_p^2 elements are represented as a vector of two F_p elements. +// `Fp2` is equivalent to type `vec384x` (used internally by BLST for F_p^2 elements). +// `Fp2` does not need to be exported to cgo. +typedef vec384x Fp2; + +// Subroup G2 in E2 +// G2 points are represented in Jacobian coordinates (x,y,z), +// where x, y, x are elements of F_p (type `Fp`). +// `G2` is equivelent to type `POINTonE2` (used internally by BLST for Jacobian E1 elements) +// `G2` is defined as a struct to be exportable through cgo to the Go layer. +typedef struct {Fp2 x,y,z} G2; #endif From e783a4289e4c02d5e63fc8be8014ce34f6850ff1 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 5 Apr 2023 18:57:40 -0600 Subject: [PATCH 025/200] new Fp and Fp2 tools, but still the mess --- crypto/bls12381_utils.c | 714 +++++++++++++++++++++++---------------- crypto/bls12381_utils.go | 6 +- crypto/bls12381_utils.h | 8 +- crypto/dkg_core.c | 2 +- 4 files changed, 438 insertions(+), 292 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index f1a93173971..3d2a1b99f6a 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -23,7 +23,101 @@ int get_Fr_BYTES() { return Fr_BYTES; } -// Fr utilities + +// Initializes Relic context with BLS12-381 parameters +ctx_t* relic_init_BLS12_381() { + // check Relic was compiled with the right conf + assert(ALLOC == AUTO); + + // sanity check of Relic constants the package is relying on + assert(RLC_OK == RLC_EQ); + + // initialize relic core with a new context + ctx_t* bls_ctx = (ctx_t*) calloc(1, sizeof(ctx_t)); + if (!bls_ctx) return NULL; + core_set(bls_ctx); + if (core_init() != RLC_OK) return NULL; + + // init BLS curve + int ret = RLC_OK; + #if (FP_PRIME == 381) + ret = ep_param_set_any_pairf(); // sets B12_P381 if FP_PRIME = 381 in relic config + #else + ep_param_set(B12_P381); + ep2_curve_set_twist(EP_MTYPE); // Multiplicative twist + #endif + + if (ret != RLC_OK) return NULL; + return core_get(); +} + +// seeds relic PRG +void seed_relic(byte* seed, int len) { + #if RAND == HASHD + // instantiate a new DRBG + ctx_t *ctx = core_get(); + ctx->seeded = 0; + #endif + rand_seed(seed, len); +} + +// global variable of the pre-computed data +prec_st bls_prec_st; +prec_st* bls_prec = NULL; + +// required constants for the optimized SWU hash to curve +#if (hashToPoint == LOCAL_SSWU) +extern const uint64_t iso_Nx_data[ELLP_Nx_LEN][Fp_LIMBS]; +extern const uint64_t iso_Ny_data[ELLP_Ny_LEN][Fp_LIMBS]; +#endif + +#if (MEMBERSHIP_CHECK_G1 == BOWE) +extern const uint64_t beta_data[Fp_LIMBS]; +extern const uint64_t z2_1_by3_data[2]; +#endif + +// sets the global variable to input +void precomputed_data_set(const prec_st* p) { + bls_prec = (prec_st*)p; +} + +// pre-compute some data required for curve BLS12-381 +prec_st* init_precomputed_data_BLS12_381() { + bls_prec = &bls_prec_st; + ctx_t* ctx = core_get(); + + // (p-1)/2 + bn_div_dig(&bls_prec->p_1div2, &ctx->prime, 2); + #if (hashToPoint == LOCAL_SSWU) + // (p-3)/4 + bn_div_dig(&bls_prec->p_3div4, &bls_prec->p_1div2, 2); + // sqrt(-z) + fp_neg(bls_prec->sqrt_z, ctx->ep_map_u); + fp_srt(bls_prec->sqrt_z, bls_prec->sqrt_z); + // -a1 and a1*z + fp_neg(bls_prec->minus_a1, ctx->ep_iso.a); + fp_mul(bls_prec->a1z, ctx->ep_iso.a, ctx->ep_map_u); + + for (int i=0; iiso_Nx[i], 
iso_Nx_data[i]); + for (int i=0; iiso_Ny[i], iso_Ny_data[i]); + #endif + + #if (MEMBERSHIP_CHECK_G1 == BOWE) + bn_new(&bls_prec->beta); + bn_read_raw(&bls_prec->beta, beta_data, Fp_LIMBS); + bn_new(&bls_prec->z2_1_by3); + bn_read_raw(&bls_prec->z2_1_by3, z2_1_by3_data, 2); + #endif + + // Montgomery constant R + fp_set_dig(bls_prec->r, 1); + return bls_prec; +} + +// ------------------- Fr utilities + // Montgomery constant R related to the curve order r const limb_t BLS12_381_rR[Fr_LIMBS] = { /* (1<<256)%r */ TO_LIMB_T(0x1824b159acc5056f), TO_LIMB_T(0x998c4fefecbc4ff5), @@ -187,18 +281,19 @@ static void pow256_from_Fr(pow256 ret, const Fr* in) { } // reads a scalar in `a` and checks it is a valid Fr element (a < r). -// input bytes are big endian. +// input is bytes-big-endian. // returns: // - BLST_BAD_ENCODING if the length is invalid // - BLST_BAD_SCALAR if the scalar isn't in Fr -// - v if the scalar is valid +// - BLST_SUCCESS if the scalar is valid BLST_ERROR Fr_read_bytes(Fr* a, const uint8_t *bin, int len) { if (len != Fr_BYTES) { return BLST_BAD_ENCODING; } pow256 tmp; + // compare to r using the provided tool from BLST pow256_from_be_bytes(tmp, bin); - if (!check_mod_256(tmp, BLS12_381_r)) { // check_mod_256 compares pow256 against a vec256! + if (!check_mod_256(tmp, BLS12_381_r)) { // check_mod_256 compares pow256 against a vec256! return BLST_BAD_SCALAR; } vec_zero(tmp, Fr_BYTES); @@ -230,9 +325,8 @@ void Fr_write_bytes(uint8_t *bin, const Fr* a) { } // maps big-endian bytes into an Fr element using modular reduction -// output is vec256 (also used as Fr) -static void -vec256_from_be_bytes(Fr* out, const unsigned char *bytes, size_t n) +// Input is byte-big-endian, output is vec256 (also used as Fr) +static void vec256_from_be_bytes(Fr* out, const unsigned char *bytes, size_t n) { Fr digit, radix; Fr_set_zero(out); @@ -257,236 +351,93 @@ vec256_from_be_bytes(Fr* out, const unsigned char *bytes, size_t n) Fr_set_zero(&digit); } -// Reads a scalar from an array and maps it to Fr. +// Reads a scalar from an array and maps it to Fr using modular reduction. +// Input is byte-big-endian as used by the external APIs. // It returns true if scalar is zero and false otherwise. bool map_bytes_to_Fr(Fr* a, const uint8_t* bin, int len) { vec256_from_be_bytes(a, bin, len); - //Fr_set_limb(a, 1); TODO: delete return Fr_is_zero(a); } -// global variable of the pre-computed data -prec_st bls_prec_st; -prec_st* bls_prec = NULL; - -// required constants for the optimized SWU hash to curve -#if (hashToPoint == LOCAL_SSWU) -extern const uint64_t iso_Nx_data[ELLP_Nx_LEN][Fp_LIMBS]; -extern const uint64_t iso_Ny_data[ELLP_Ny_LEN][Fp_LIMBS]; -#endif - -#if (MEMBERSHIP_CHECK_G1 == BOWE) -extern const uint64_t beta_data[Fp_LIMBS]; -extern const uint64_t z2_1_by3_data[2]; -#endif - -// sets the global variable to input -void precomputed_data_set(const prec_st* p) { - bls_prec = (prec_st*)p; -} - -// Reads a prime field element from a digit vector in big endian format. -// There is no conversion to Montgomery domain in this function. 
- #define fp_read_raw(a, data_pointer) dv_copy((a), (data_pointer), Fp_LIMBS) - -// pre-compute some data required for curve BLS12-381 -prec_st* init_precomputed_data_BLS12_381() { - bls_prec = &bls_prec_st; - ctx_t* ctx = core_get(); - - // (p-1)/2 - bn_div_dig(&bls_prec->p_1div2, &ctx->prime, 2); - #if (hashToPoint == LOCAL_SSWU) - // (p-3)/4 - bn_div_dig(&bls_prec->p_3div4, &bls_prec->p_1div2, 2); - // sqrt(-z) - fp_neg(bls_prec->sqrt_z, ctx->ep_map_u); - fp_srt(bls_prec->sqrt_z, bls_prec->sqrt_z); - // -a1 and a1*z - fp_neg(bls_prec->minus_a1, ctx->ep_iso.a); - fp_mul(bls_prec->a1z, ctx->ep_iso.a, ctx->ep_map_u); - - for (int i=0; iiso_Nx[i], iso_Nx_data[i]); - for (int i=0; iiso_Ny[i], iso_Ny_data[i]); - #endif - - #if (MEMBERSHIP_CHECK_G1 == BOWE) - bn_new(&bls_prec->beta); - bn_read_raw(&bls_prec->beta, beta_data, Fp_LIMBS); - bn_new(&bls_prec->z2_1_by3); - bn_read_raw(&bls_prec->z2_1_by3, z2_1_by3_data, 2); - #endif - - // Montgomery constant R - fp_set_dig(bls_prec->r, 1); - return bls_prec; -} - -// Initializes Relic context with BLS12-381 parameters -ctx_t* relic_init_BLS12_381() { - // check Relic was compiled with the right conf - assert(ALLOC == AUTO); - - // sanity check of Relic constants the package is relying on - assert(RLC_OK == RLC_EQ); - - // initialize relic core with a new context - ctx_t* bls_ctx = (ctx_t*) calloc(1, sizeof(ctx_t)); - if (!bls_ctx) return NULL; - core_set(bls_ctx); - if (core_init() != RLC_OK) return NULL; - - // init BLS curve - int ret = RLC_OK; - #if (FP_PRIME == 381) - ret = ep_param_set_any_pairf(); // sets B12_P381 if FP_PRIME = 381 in relic config - #else - ep_param_set(B12_P381); - ep2_curve_set_twist(EP_MTYPE); // Multiplicative twist - #endif - - if (ret != RLC_OK) return NULL; - return core_get(); -} - -// seeds relic PRG -void seed_relic(byte* seed, int len) { - #if RAND == HASHD - // instantiate a new DRBG - ctx_t *ctx = core_get(); - ctx->seeded = 0; - #endif - rand_seed(seed, len); -} - -// Exponentiation of a generic point p in G1 -void ep_mult(ep_t res, const ep_t p, const Fr *expo) { - bn_st* tmp_expo = Fr_blst_to_relic(expo); - // Using window NAF of size 2 - ep_mul_lwnaf(res, p, tmp_expo); - free(tmp_expo); -} - -// Exponentiation of generator g1 in G1 -// These two function are here for bench purposes only -void ep_mult_gen_bench(ep_t res, const Fr* expo) { - bn_st* tmp_expo = Fr_blst_to_relic(expo); - // Using precomputed table of size 4 - ep_mul_gen(res, tmp_expo); - free(tmp_expo); -} +// ------------------- Fp utilities -void ep_mult_generic_bench(ep_t res, const Fr* expo) { - // generic point multiplication - ep_mult(res, &core_get()->ep_g, expo); -} +// Montgomery constant R related to the prime p +const limb_t BLS12_381_pR[Fp_LIMBS] = { ONE_MONT_P }; /* (1<<384)%p */ -// Exponentiation of a generic point p in G2 -void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo) { - bn_st* tmp_expo = Fr_blst_to_relic(expo); - // Using window NAF of size 2 - ep2_mul_lwnaf(res, p, tmp_expo); - free(tmp_expo); +// sets `a` to 0 +void Fp_set_zero(Fp* a){ + vec_zero((byte*)a, Fp_BYTES); } -// Exponentiation of generator g2 in G2 -void G2_mult_gen(G2* res, const Fr* expo) { - pow256 tmp; - pow256_from_Fr(tmp, expo); - POINTonE2_sign(res, &BLS12_381_G2, tmp); +// sets `a` to limb `l` +void Fp_set_limb(Fp* a, const limb_t l){ + vec_zero((byte*)a + sizeof(limb_t), Fp_BYTES - sizeof(limb_t)); + *((limb_t*)a) = l; } -// DEBUG printing functions -void bytes_print_(char* s, byte* data, int len) { - printf("[%s]:\n", s); - for (int i=0; i (p - 
1)/2 and 0 otherwise. -static int fp_get_sign(const fp_t y) { - bn_t bn_y; - bn_new(bn_y); - fp_prime_back(bn_y, y); - return bn_cmp(bn_y, &bls_prec->p_1div2) == RLC_GT; +// res = a^2 * R^(-1) +void Fp_squ_montg(Fp *res, const Fp *a) { + sqr_mont_384((limb_t*)res, (limb_t*)a, BLS12_381_P, p0); } -// ep_write_bin_compact exports a point a in E(Fp) to a buffer bin in a compressed or uncompressed form. -// len is the allocated size of the buffer bin. -// The serialization is following: -// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -// The code is a modified version of Relic ep_write_bin -void ep_write_bin_compact(byte *bin, const ep_t a, const int len) { - const int G1_size = (G1_BYTES/(G1_SERIALIZATION+1)); - - if (len!=G1_size) { - RLC_THROW(ERR_NO_BUFFER); - return; - } - - if (ep_is_infty(a)) { - // set the infinity bit - bin[0] = (G1_SERIALIZATION << 7) | 0x40; - memset(bin+1, 0, G1_size-1); - return; - } - - RLC_TRY { - ep_t t; - ep_null(t); - ep_new(t); - ep_norm(t, a); - fp_write_bin(bin, Fp_BYTES, t->x); +// res = a*R +void Fp_to_montg(Fp *res, const Fp *a) { + mul_mont_384((limb_t*)res, (limb_t*)a, BLS12_381_RR, BLS12_381_P, p0); +} - if (G1_SERIALIZATION == COMPRESSED) { - bin[0] |= (fp_get_sign(t->y) << 5); - } else { - fp_write_bin(bin + Fp_BYTES, Fp_BYTES, t->y); - } - ep_free(t); - } RLC_CATCH_ANY { - RLC_THROW(ERR_CAUGHT); - } +// res = a*R^(-1) +void Fp_from_montg(Fp *res, const Fp *a) { + from_mont_384((limb_t*)res, (limb_t*)a, BLS12_381_P, p0); +} - bin[0] |= (G1_SERIALIZATION << 7); - } +// reads a scalar in `a` and checks it is a valid Fp element (a < p). +// input is bytes-big-endian. +// returns: +// - BLST_BAD_ENCODING if the length is invalid +// - BLST_BAD_SCALAR if the scalar isn't in Fp +// - BLST_SUCCESS if the scalar is valid +BLST_ERROR Fp_read_bytes(Fp* a, const byte *bin, int len) { + if (len != Fp_BYTES) { + return BLST_BAD_ENCODING; + } + limbs_from_be_bytes((limb_t*)a, bin, Fp_BYTES); + // compare read scalar to p + if (!check_Fp(a)) { + return BLST_BAD_ENCODING; + } + return BLST_SUCCESS; +} // fp_read_bin_safe is a modified version of Relic's (void fp_read_bin). // It reads a field element from a buffer and makes sure the big number read can be @@ -526,6 +477,79 @@ static int fp_read_bin_safe(fp_t a, const uint8_t *bin, int len) { return ret; } +// Reads a prime field element from a digit vector in big endian format. +// There is no conversion to Montgomery domain in this function. + #define fp_read_raw(a, data_pointer) dv_copy((a), (data_pointer), Fp_LIMBS) + +// returns the sign of y. +// 1 if y > (p - 1)/2 and 0 otherwise. +// y is in montgomery form +static int Fp_get_sign(const fp_t y) { + sgn0_pty_mont_384(y, BLS12_381_P, p0); +} + +// ------------------- Fp^2 utilities + +// sets `a` to limb `l` +void Fp2_set_limb(Fp2* a, const limb_t l){ + Fp_set_limb(a[0], l); // TODO: check!! 
+ Fp_set_zero(a[1]); +} + +void Fp2_add(Fp2 *res, const Fp2 *a, const Fp2 *b) { + add_mod_384x(res, a, b, BLS12_381_P); +} + +void Fp2_sub(Fp2 *res, const Fp2 *a, const Fp2 *b) { + sub_mod_384x(res, a, b, BLS12_381_P); +} + +void Fp2_neg(Fp2 *res, const Fp2 *a) { + cneg_mod_384(res[0], a[0], 1, BLS12_381_P); + cneg_mod_384(res[1], a[1], 1, BLS12_381_P); +} + +// res = a*b in montgomery form +void Fp2_mul_montg(Fp2 *res, const Fp2 *a, const Fp2 *b) { + mul_mont_384x(res, a, b, BLS12_381_P, p0); +} + +// res = a^2 in montgomery form +void Fp2_squ_montg(Fp2 *res, const Fp2 *a) { + sqr_mont_384x(res, a, BLS12_381_P, p0); +} + +// returns the sign of y. +// sign(y_0) if y_1 = 0, else sign(y_1) +// y coordinates are in montgommery form +static int Fp2_get_sign(fp2_t y) { + sgn0_pty_mont_384x(y, BLS12_381_P, p0); +} + +// reads an Fp^2 element in `a`. +// input is a serialization of a[1] concatenated to serializetion of a[0]. +// a[i] are both Fp elements. +// returns: +// - BLST_BAD_ENCODING if the length is invalid +// - BLST_BAD_SCALAR if the scalar isn't in Fp +// - BLST_SUCCESS if the scalar is valid +static BLST_ERROR Fp2_read_bytes(Fp2* a, const byte *bin, int len) { + if (len != Fp2_BYTES) { + return BLST_BAD_ENCODING; + } + BLST_ERROR ret = Fp_read_bytes(a[0], bin, Fp_BYTES); + if (ret != BLST_SUCCESS) { + return ret; + } + ret = Fp_read_bytes(a[1], bin + Fp_BYTES, Fp_BYTES); + if ( ret != BLST_SUCCESS) { + return ret; + } + return BLST_SUCCESS; +} + +// ------------------- G1 utilities + // ep_read_bin_compact imports a point from a buffer in a compressed or uncompressed form. // len is the size of the input buffer. // @@ -600,92 +624,92 @@ int ep_read_bin_compact(ep_t a, const byte *bin, const int len) { return RLC_ERR; } - -// returns the sign of y. -// sign(y_0) if y_1 = 0, else sign(y_1) -static int fp2_get_sign(fp2_t y) { - if (fp_is_zero(y[1])) { // no need to convert back as the montgomery form of 0 is 0 - return fp_get_sign(y[0]); - } - return fp_get_sign(y[1]); -} - -// ep2_write_bin_compact exports a point in E(Fp^2) to a buffer in a compressed or uncompressed form. +// ep_write_bin_compact exports a point a in E(Fp) to a buffer bin in a compressed or uncompressed form. // len is the allocated size of the buffer bin. 
// The serialization is following: -// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -// The code is a modified version of Relic ep2_write_bin -void ep2_write_bin_compact(byte *bin, const ep2_t a, const int len) { - ep2_t t; - ep2_null(t); - const int G2_size = (G2_BYTES/(G2_SERIALIZATION+1)); +// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) +// The code is a modified version of Relic ep_write_bin +void ep_write_bin_compact(byte *bin, const ep_t a, const int len) { + const int G1_size = (G1_BYTES/(G1_SERIALIZATION+1)); - if (len!=G2_size) { + if (len!=G1_size) { RLC_THROW(ERR_NO_BUFFER); return; } - if (ep2_is_infty((ep2_st *)a)) { + if (ep_is_infty(a)) { // set the infinity bit - bin[0] = (G2_SERIALIZATION << 7) | 0x40; - memset(bin+1, 0, G2_size-1); + bin[0] = (G1_SERIALIZATION << 7) | 0x40; + memset(bin+1, 0, G1_size-1); return; } RLC_TRY { - ep2_new(t); - ep2_norm(t, (ep2_st *)a); - fp2_write_bin(bin, Fp2_BYTES, t->x, 0); + ep_t t; + ep_null(t); + ep_new(t); + ep_norm(t, a); + fp_write_bin(bin, Fp_BYTES, t->x); - if (G2_SERIALIZATION == COMPRESSED) { - bin[0] |= (fp2_get_sign(t->y) << 5); + if (G1_SERIALIZATION == COMPRESSED) { + bin[0] |= (Fp_get_sign(t->y) << 5); } else { - fp2_write_bin(bin + Fp2_BYTES, Fp2_BYTES, t->y, 0); + fp_write_bin(bin + Fp_BYTES, Fp_BYTES, t->y); } + ep_free(t); } RLC_CATCH_ANY { RLC_THROW(ERR_CAUGHT); } - bin[0] |= (G2_SERIALIZATION << 7); - ep_free(t); + bin[0] |= (G1_SERIALIZATION << 7); + } + +// Exponentiation of a generic point p in G1 +void ep_mult(ep_t res, const ep_t p, const Fr *expo) { + bn_st* tmp_expo = Fr_blst_to_relic(expo); + // Using window NAF of size 2 + ep_mul_lwnaf(res, p, tmp_expo); + free(tmp_expo); } -// fp2_read_bin_safe is a modified version of Relic's (void fp2_read_bin). -// It reads an Fp^2 element from a buffer and makes sure the big numbers read can be -// written as field elements (are reduced modulo p). -// Unlike Relic's versions, the function does not reduce the read integers modulo p and does -// not throw an exception for integers larger than p. The function returns RLC_OK if the input -// corresponds to a field element in Fp^2, and returns RLC_ERR otherwise. -static int fp2_read_bin_safe(fp2_t a, const uint8_t *bin, int len) { - if (len != Fp2_BYTES) { - return RLC_ERR; - } - if (fp_read_bin_safe(a[0], bin, Fp_BYTES) != RLC_OK) { - return RLC_ERR; - } - if (fp_read_bin_safe(a[1], bin + Fp_BYTES, Fp_BYTES) != RLC_OK) { - return RLC_ERR; - } - return RLC_OK; +// Exponentiation of generator g1 in G1 +// These two function are here for bench purposes only +void ep_mult_gen_bench(ep_t res, const Fr* expo) { + bn_st* tmp_expo = Fr_blst_to_relic(expo); + // Using precomputed table of size 4 + ep_mul_gen(res, tmp_expo); + free(tmp_expo); +} + +void ep_mult_generic_bench(ep_t res, const Fr* expo) { + // generic point multiplication + ep_mult(res, &core_get()->ep_g, expo); } -// ep2_read_bin_compact imports a point from a buffer in a compressed or uncompressed form. +// ------------------- G2 utilities + +// G2_read_bytes imports a point from a buffer in a compressed or uncompressed form. // The resulting point is guaranteed to be on curve E2. // -// It returns RLC_OK if the inputs are valid (input buffer lengths are valid and read coordinates -// correspond to a point on curve) and the execution completes and RLC_ERR otherwise. 
-// The code is a modified version of Relic ep2_read_bin -int ep2_read_bin_compact(ep2_t a, const byte *bin, const int len) { +// reads a scalar in `a` and checks it is a valid Fp element (a < p). +// input is bytes-big-endian. +// returns: +// - BLST_BAD_ENCODING if the length is invalid or serialization header bits are invalid +// - BLST_BAD_SCALAR if Fp^2 coordinates couldn't deserialize +// - BLST_POINT_NOT_ON_CURVE if deserialized point isn't on E2 +// - BLST_SUCCESS if deserialization is valid + +// TODO: replace with POINTonE2_Deserialize_BE and POINTonE2_Uncompress_Z ? +BLST_ERROR G2_read_bytes(G2* a, const byte *bin, const int len) { // check the length - const int G2size = (G2_BYTES/(G2_SERIALIZATION+1)); - if (len!=G2size) { - return RLC_ERR; + if (len != G2_SER_BYTES) { + return BLST_BAD_ENCODING; } // check the compression bit int compressed = bin[0] >> 7; if ((compressed == 1) != (G2_SERIALIZATION == COMPRESSED)) { - return RLC_ERR; + return BLST_BAD_ENCODING; } // check if the point in infinity @@ -693,54 +717,129 @@ int ep2_read_bin_compact(ep2_t a, const byte *bin, const int len) { if (is_infinity) { // the remaining bits need to be cleared if (bin[0] & 0x3F) { - return RLC_ERR; + return BLST_BAD_ENCODING; } - for (int i=1; i> 5) & 1; if (y_sign && (!compressed)) { - return RLC_ERR; + return BLST_BAD_ENCODING; } - a->coord = BASIC; - fp2_set_dig(a->z, 1); // a.z // use a temporary buffer to mask the header bits and read a.x byte temp[Fp2_BYTES]; memcpy(temp, bin, Fp2_BYTES); temp[0] &= 0x1F; // clear the header bits - if (fp2_read_bin_safe(a->x, temp, sizeof(temp)) != RLC_OK) { - return RLC_ERR; + BLST_ERROR ret = fp2_read_bytes(a->x, temp, sizeof(temp)); + if (ret != BLST_SUCCESS) { + return ret; } + // set a.z to 1 + Fp_copy(a->z[0], BLS12_381_pR); + Fp_set_zero(a->z[1]); + if (G2_SERIALIZATION == UNCOMPRESSED) { - if (fp2_read_bin_safe(a->y, bin + Fp2_BYTES, Fp2_BYTES) != RLC_OK){ - return RLC_ERR; + ret = fp2_read_bytes(a->y, bin + Fp2_BYTES, sizeof(a->y)); + if (ret != BLST_SUCCESS){ + return ret; } // check read point is on curve - if (!ep2_on_curve(a)) { - return RLC_ERR; + if (!G2_on_curve(a)) { + return BLST_POINT_NOT_ON_CURVE; } - return RLC_OK; + return BLST_SUCCESS; } - fp2_zero(a->y); - fp_set_bit(a->y[0], 0, y_sign); - fp_zero(a->y[1]); - if (ep2_upk(a, a) == 1) { - // resulting point is guaranteed to be on curve - return RLC_OK; + // compute the possible square root + Fp_to_montg((a->x)[0], a->x[0]); + Fp_to_montg(a->x[1], a->x[1]); + + Fp2_squ_montg(a->y, a->x); + Fp2_mul_montg(a->y, a->y, a->x); + Fp2_add(a->y, a->y, B_E2); + if (!sqrt_fp2(a->y, a->y)) // (y^2 = x^3+b) has no solution in y + return BLST_POINT_NOT_ON_CURVE; + + // resulting (x,y) is guaranteed to be on curve + if (Fp2_get_sign(a->y) != y_sign) { + Fp2_neg(a->y, a->y); // flip y sign if needed } - return RLC_ERR; + return BLST_SUCCESS; +} + +// ep2_write_bin_compact exports a point in E(Fp^2) to a buffer in a compressed or uncompressed form. +// len is the allocated size of the buffer bin. 
+// The serialization is following: +// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) +// The code is a modified version of Relic ep2_write_bin +void ep2_write_bin_compact(byte *bin, const ep2_t a, const int len) { + ep2_t t; + ep2_null(t); + const int G2_size = (G2_BYTES/(G2_SERIALIZATION+1)); + + if (len!=G2_size) { + RLC_THROW(ERR_NO_BUFFER); + return; + } + + if (ep2_is_infty((ep2_st *)a)) { + // set the infinity bit + bin[0] = (G2_SERIALIZATION << 7) | 0x40; + memset(bin+1, 0, G2_size-1); + return; + } + + RLC_TRY { + ep2_new(t); + ep2_norm(t, (ep2_st *)a); + fp2_write_bin(bin, Fp2_BYTES, t->x, 0); + + if (G2_SERIALIZATION == COMPRESSED) { + bin[0] |= (Fp2_get_sign(t->y) << 5); + } else { + fp2_write_bin(bin + Fp2_BYTES, Fp2_BYTES, t->y, 0); + } + } RLC_CATCH_ANY { + RLC_THROW(ERR_CAUGHT); + } + + bin[0] |= (G2_SERIALIZATION << 7); + ep_free(t); +} + +// set p to infinity +static void G2_set_infty(G2* p) { + vec_zero(p, G2_BYTES); +} + +// checks p is on G2 +static bool G2_on_curve(G2* p) { + return POINTonE2_affine_on_curve(p) | vec_is_zero(p, sizeof(*p)); +} + +// Exponentiation of a generic point p in G2 +void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo) { + bn_st* tmp_expo = Fr_blst_to_relic(expo); + // Using window NAF of size 2 + ep2_mul_lwnaf(res, p, tmp_expo); + free(tmp_expo); } +// Exponentiation of generator g2 in G2 +void G2_mult_gen(G2* res, const Fr* expo) { + pow256 tmp; + pow256_from_Fr(tmp, expo); + POINTonE2_sign(res, &BLS12_381_G2, tmp); +} // computes the sum of the G2 array elements y and writes the sum in jointy void ep2_sum_vector(ep2_t jointy, ep2_st* y, const int len){ @@ -753,6 +852,9 @@ void ep2_sum_vector(ep2_t jointy, ep2_st* y, const int len){ // public key } +// ------------------- other + + // Verifies the validity of 2 SPoCK proofs and 2 public keys. // Membership check in G1 of both proofs is verified in this function. // Membership check in G2 of both keys is not verified in this function. 
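/*
 * A toy, self-contained sketch of the decompression step performed by the
 * G2 deserialization above: recover y from x by solving y^2 = x^3 + b, then
 * flip the sign of the root so it matches the serialized sign bit.
 * This sketch uses a tiny prime field (p = 103, curve y^2 = x^3 + 4) instead
 * of BLS12-381's F_p^2; every name and constant below is illustrative only
 * and not part of the library. Since p % 4 == 3, a square root (when it
 * exists) is a single exponentiation by (p+1)/4.
 */
#include <stdint.h>
#include <stdio.h>

#define TOY_P 103u  /* small prime, TOY_P % 4 == 3 */
#define TOY_B 4u    /* toy curve: y^2 = x^3 + 4 */

static uint32_t toy_pow_mod(uint32_t a, uint32_t e) {
    uint64_t r = 1, base = a % TOY_P;
    while (e) {
        if (e & 1) r = (r * base) % TOY_P;
        base = (base * base) % TOY_P;
        e >>= 1;
    }
    return (uint32_t)r;
}

/* returns 1 and sets *y if x is the x-coordinate of a curve point, 0 otherwise */
static int toy_decompress(uint32_t x, int y_sign, uint32_t* y) {
    uint32_t rhs  = (toy_pow_mod(x, 3) + TOY_B) % TOY_P;    /* x^3 + b */
    uint32_t root = toy_pow_mod(rhs, (TOY_P + 1) / 4);      /* candidate square root */
    if ((uint64_t)root * root % TOY_P != rhs) return 0;     /* x^3 + b is not a square */
    /* sign convention as in fp_get_sign above: 1 iff y > (p-1)/2 */
    int sign = root > (TOY_P - 1) / 2;
    *y = (sign == y_sign) ? root : (TOY_P - root) % TOY_P;  /* flip sign if needed */
    return 1;
}

int main(void) {
    uint32_t y;
    for (uint32_t x = 0; x < TOY_P; x++) {
        if (toy_decompress(x, 1, &y)) printf("x=%2u -> y=%2u\n", x, y);
    }
    return 0;
}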
@@ -1022,3 +1124,45 @@ void ep2_rand_G2complement(ep2_t p) { void xmd_sha256(uint8_t *hash, int len_hash, uint8_t *msg, int len_msg, uint8_t *dst, int len_dst){ md_xmd_sh256(hash, len_hash, msg, len_msg, dst, len_dst); } + + +// DEBUG printing functions +void bytes_print_(char* s, byte* data, int len) { + printf("[%s]:\n", s); + for (int i=0; i Date: Tue, 11 Apr 2023 17:39:33 -0600 Subject: [PATCH 026/200] first changes to use new type G2 --- crypto/bls.go | 12 +- crypto/bls12381_utils.c | 298 ++++++++++++++++++++++--------- crypto/bls12381_utils.go | 38 ++-- crypto/bls12381_utils.h | 55 +++--- crypto/bls12381_utils_test.go | 42 ++--- crypto/bls_core.c | 27 +-- crypto/bls_include.h | 10 +- crypto/bls_multisig.go | 23 ++- crypto/bls_test.go | 26 +-- crypto/bls_thresholdsign.go | 4 +- crypto/bls_thresholdsign_test.go | 3 +- crypto/blst_include.h | 28 +-- crypto/{ => blst_src}/blst_src.c | 1 + crypto/blst_src/client_min_pk.c | 17 -- crypto/blst_src/client_min_sig.c | 17 -- crypto/dkg.go | 3 + crypto/dkg_core.c | 14 +- crypto/dkg_feldmanvss.go | 17 +- crypto/dkg_feldmanvssq.go | 7 +- crypto/dkg_include.h | 2 +- crypto/dkg_jointfeldman.go | 11 +- crypto/dkg_test.go | 3 +- crypto/spock.go | 6 +- crypto/spock_test.go | 2 + crypto/thresholdsign.go | 2 + 25 files changed, 409 insertions(+), 259 deletions(-) rename crypto/{ => blst_src}/blst_src.c (99%) delete mode 100644 crypto/blst_src/client_min_pk.c delete mode 100644 crypto/blst_src/client_min_sig.c diff --git a/crypto/bls.go b/crypto/bls.go index 447ba6f532e..66f4c809e85 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -211,7 +211,7 @@ func (pk *pubKeyBLSBLS12381) Verify(s Signature, data []byte, kmac hash.Hasher) return false, nil } - verif := C.bls_verify((*C.ep2_st)(&pk.point), + verif := C.bls_verify((*C.G2)(&pk.point), (*C.uchar)(&s[0]), (*C.uchar)(&h[0]), (C.int)(len(h))) @@ -352,7 +352,7 @@ func (a *blsBLS12381Algo) decodePublicKey(publicKeyBytes []byte) (PublicKey, err } // membership check in G2 - if C.check_membership_G2((*C.ep2_st)(&pk.point)) != valid { + if C.G2_check_membership((*C.G2)(&pk.point)) != valid { return nil, invalidInputsErrorf("input key is infinity or does not encode a BLS12-381 point in the valid group") } @@ -498,15 +498,15 @@ func (a *pubKeyBLSBLS12381) EncodeCompressed() []byte { if serializationG2 != compressed { panic("library is not configured to use compressed public key serialization") } - return a.Encode() + dest := make([]byte, pubKeyLengthBLSBLS12381) + writePointG2(dest, &a.point) + return dest } // Encode returns a byte encoding of the public key. // Since we use a compressed encoding by default, this delegates to EncodeCompressed func (a *pubKeyBLSBLS12381) Encode() []byte { - dest := make([]byte, pubKeyLengthBLSBLS12381) - writePointG2(dest, &a.point) - return dest + return a.EncodeCompressed() } // Equals checks is two public keys are equal diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 3d2a1b99f6a..b66be0932a2 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -8,6 +8,8 @@ #include "bls_include.h" #include "assert.h" +#include "blst_src.c" + // The functions are tested for ALLOC=AUTO (not for ALLOC=DYNAMIC) // return macro values to the upper Go Layer @@ -81,8 +83,13 @@ void precomputed_data_set(const prec_st* p) { bls_prec = (prec_st*)p; } +// Reads a prime field element from a digit vector in big endian format. +// There is no conversion to Montgomery domain in this function. 
+#define fp_read_raw(a, data_pointer) dv_copy((a), (data_pointer), Fp_LIMBS) + // pre-compute some data required for curve BLS12-381 prec_st* init_precomputed_data_BLS12_381() { + bls_prec = &bls_prec_st; ctx_t* ctx = core_get(); @@ -375,21 +382,11 @@ void Fp_set_limb(Fp* a, const limb_t l){ *((limb_t*)a) = l; } -static bool check_Fp(byte *in) { - // use same method as in BLST internal function - // which seems the most efficient. The method uses the assembly-based - // modular addition instead of limbs comparison - vec384 temp; - add_fp(temp, in, ZERO_384); - return vec_is_equal(temp, in, Fp_BYTES); - // no need to clear `tmp` as no use-case involves sensitive data being passed as `in` -} - void Fp_copy(Fp* res, const Fp* a) { vec_copy((byte*)res, (byte*)a, Fp_BYTES); } -void Fp_add(Fp *res, const Fp *a, const Fp *b) { +static void Fp_add(Fp *res, const Fp *a, const Fp *b) { add_mod_384((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_P); } @@ -401,6 +398,16 @@ void Fp_neg(Fp *res, const Fp *a) { cneg_mod_384((limb_t*)res, (limb_t*)a, 1, BLS12_381_P); } +static bool check_Fp(const Fp* in) { + // use same method as in BLST internal function + // which seems the most efficient. The method uses the assembly-based + // modular addition instead of limbs comparison + Fp temp; + Fp_add(&temp, in, &ZERO_384); + return vec_is_equal(&temp, in, Fp_BYTES); + // no need to clear `tmp` as no use-case involves sensitive data being passed as `in` +} + // res = a*b*R^(-1) void Fp_mul_montg(Fp *res, const Fp *a, const Fp *b) { mul_mont_384((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_P, p0); @@ -439,6 +446,12 @@ BLST_ERROR Fp_read_bytes(Fp* a, const byte *bin, int len) { return BLST_SUCCESS; } + +// write Fp element to bin and assume `bin` has `Fp_BYTES` allocated bytes. +void Fp_write_bytes(byte *bin, const Fp* a) { + be_bytes_from_limbs(bin, (limb_t*)a, Fp_BYTES); +} + // fp_read_bin_safe is a modified version of Relic's (void fp_read_bin). // It reads a field element from a buffer and makes sure the big number read can be // written as a field element (is reduced modulo p). @@ -477,57 +490,59 @@ static int fp_read_bin_safe(fp_t a, const uint8_t *bin, int len) { return ret; } -// Reads a prime field element from a digit vector in big endian format. -// There is no conversion to Montgomery domain in this function. - #define fp_read_raw(a, data_pointer) dv_copy((a), (data_pointer), Fp_LIMBS) - // returns the sign of y. // 1 if y > (p - 1)/2 and 0 otherwise. // y is in montgomery form -static int Fp_get_sign(const fp_t y) { - sgn0_pty_mont_384(y, BLS12_381_P, p0); +static limb_t Fp_get_sign(const fp_t y) { + return sgn0_pty_mont_384(y, BLS12_381_P, p0); } // ------------------- Fp^2 utilities // sets `a` to limb `l` void Fp2_set_limb(Fp2* a, const limb_t l){ - Fp_set_limb(a[0], l); // TODO: check!! 
- Fp_set_zero(a[1]); + Fp_set_limb(&real(a), l); + Fp_set_zero(&imag(a)); } void Fp2_add(Fp2 *res, const Fp2 *a, const Fp2 *b) { - add_mod_384x(res, a, b, BLS12_381_P); + add_mod_384x((vec384*)res, (vec384*)a, (vec384*)b, BLS12_381_P); } void Fp2_sub(Fp2 *res, const Fp2 *a, const Fp2 *b) { - sub_mod_384x(res, a, b, BLS12_381_P); + sub_mod_384x((vec384*)res, (vec384*)a, (vec384*)b, BLS12_381_P); } void Fp2_neg(Fp2 *res, const Fp2 *a) { - cneg_mod_384(res[0], a[0], 1, BLS12_381_P); - cneg_mod_384(res[1], a[1], 1, BLS12_381_P); + cneg_mod_384(real(res), real(a), 1, BLS12_381_P); + cneg_mod_384(imag(res), imag(a), 1, BLS12_381_P); } // res = a*b in montgomery form void Fp2_mul_montg(Fp2 *res, const Fp2 *a, const Fp2 *b) { - mul_mont_384x(res, a, b, BLS12_381_P, p0); + mul_mont_384x((vec384*)res, (vec384*)a, (vec384*)b, BLS12_381_P, p0); } // res = a^2 in montgomery form void Fp2_squ_montg(Fp2 *res, const Fp2 *a) { - sqr_mont_384x(res, a, BLS12_381_P, p0); + sqr_mont_384x((vec384*)res, (vec384*)a, BLS12_381_P, p0); +} + +// checks if `a` is a quadratic residue in Fp^2. If yes, it computes +// the square root in `res`. +static bool_t Fp2_sqrt(Fp2 *res, const Fp2* a) { + return sqrt_fp2((vec384*)res, (vec384*)a); } // returns the sign of y. // sign(y_0) if y_1 = 0, else sign(y_1) // y coordinates are in montgommery form -static int Fp2_get_sign(fp2_t y) { - sgn0_pty_mont_384x(y, BLS12_381_P, p0); +static limb_t Fp2_get_sign(Fp2* y) { + return sgn0_pty_mont_384x((vec384*)y, BLS12_381_P, p0); } // reads an Fp^2 element in `a`. -// input is a serialization of a[1] concatenated to serializetion of a[0]. +// input is a serialization of real(a) concatenated to serializetion of imag(a). // a[i] are both Fp elements. // returns: // - BLST_BAD_ENCODING if the length is invalid @@ -537,17 +552,23 @@ static BLST_ERROR Fp2_read_bytes(Fp2* a, const byte *bin, int len) { if (len != Fp2_BYTES) { return BLST_BAD_ENCODING; } - BLST_ERROR ret = Fp_read_bytes(a[0], bin, Fp_BYTES); + BLST_ERROR ret = Fp_read_bytes(&real(a), bin, Fp_BYTES); if (ret != BLST_SUCCESS) { return ret; } - ret = Fp_read_bytes(a[1], bin + Fp_BYTES, Fp_BYTES); + ret = Fp_read_bytes(&imag(a), bin + Fp_BYTES, Fp_BYTES); if ( ret != BLST_SUCCESS) { return ret; } return BLST_SUCCESS; } +// write Fp2 element to bin and assume `bin` has `Fp2_BYTES` allocated bytes. +void Fp2_write_bytes(byte *bin, const Fp2* a) { + Fp_write_bytes(bin, &real(a)); + Fp_write_bytes(bin + Fp_BYTES, &imag(a)); +} + // ------------------- G1 utilities // ep_read_bin_compact imports a point from a buffer in a compressed or uncompressed form. 
@@ -686,10 +707,101 @@ void ep_mult_generic_bench(ep_t res, const Fr* expo) { ep_mult(res, &core_get()->ep_g, expo); } -// ------------------- G2 utilities +// ------------------- E2 utilities + +// TODO: to delete +static int fp2_read_bin_safe(fp2_t a, const uint8_t *bin, int len) { + if (len != Fp2_BYTES) { + return RLC_ERR; + } + if (fp_read_bin_safe(a[0], bin, Fp_BYTES) != RLC_OK) { + return RLC_ERR; + } + if (fp_read_bin_safe(a[1], bin + Fp_BYTES, Fp_BYTES) != RLC_OK) { + return RLC_ERR; + } + return RLC_OK; +} + +// TODO: to delete, only used by temporary E2_blst_to_relic +static int ep2_read_bin_compact(ep2_t a, const byte *bin, const int len) { + // check the length + const int G2size = (G2_BYTES/(G2_SERIALIZATION+1)); + if (len!=G2size) { + return RLC_ERR; + } + + // check the compression bit + int compressed = bin[0] >> 7; + if ((compressed == 1) != (G2_SERIALIZATION == COMPRESSED)) { + return RLC_ERR; + } + + // check if the point in infinity + int is_infinity = bin[0] & 0x40; + if (is_infinity) { + // the remaining bits need to be cleared + if (bin[0] & 0x3F) { + return RLC_ERR; + } + for (int i=1; i> 5) & 1; + if (y_sign && (!compressed)) { + return RLC_ERR; + } + + a->coord = BASIC; + fp2_set_dig(a->z, 1); // a.z + // use a temporary buffer to mask the header bits and read a.x + byte temp[Fp2_BYTES]; + memcpy(temp, bin, Fp2_BYTES); + temp[0] &= 0x1F; // clear the header bits + if (fp2_read_bin_safe(a->x, temp, sizeof(temp)) != RLC_OK) { + return RLC_ERR; + } + + if (G2_SERIALIZATION == UNCOMPRESSED) { + if (fp2_read_bin_safe(a->y, bin + Fp2_BYTES, Fp2_BYTES) != RLC_OK){ + return RLC_ERR; + } + // check read point is on curve + if (!ep2_on_curve(a)) { + return RLC_ERR; + } + return RLC_OK; + } + + fp2_zero(a->y); + fp_set_bit(a->y[0], 0, y_sign); + fp_zero(a->y[1]); + if (ep2_upk(a, a) == 1) { + // resulting point is guaranteed to be on curve + return RLC_OK; + } + return RLC_ERR; +} + +// TODO: temp utility function to delete +ep2_st* E2_blst_to_relic(const G2* x) { + ep2_st* out = (ep2_st*)malloc(sizeof(ep2_st)); + byte* data = (byte*)malloc(G2_SER_BYTES); + E2_write_bytes(data, x); + ep2_read_bin_compact(out, data, G2_SER_BYTES); + free(data); + return out; +} + +// E2_read_bytes imports a point from a buffer in a compressed or uncompressed form. +// The resulting point is guaranteed to be on curve E2 (no G2 check is included) // // reads a scalar in `a` and checks it is a valid Fp element (a < p). // input is bytes-big-endian. @@ -699,8 +811,9 @@ void ep_mult_generic_bench(ep_t res, const Fr* expo) { // - BLST_POINT_NOT_ON_CURVE if deserialized point isn't on E2 // - BLST_SUCCESS if deserialization is valid -// TODO: replace with POINTonE2_Deserialize_BE and POINTonE2_Uncompress_Z ? -BLST_ERROR G2_read_bytes(G2* a, const byte *bin, const int len) { +// TODO: replace with POINTonE2_Deserialize_BE and POINTonE2_Uncompress_Z, +// and update logic with G2 subgroup check? 
+BLST_ERROR E2_read_bytes(G2* a, const byte *bin, const int len) { // check the length if (len != G2_SER_BYTES) { return BLST_BAD_ENCODING; @@ -724,7 +837,7 @@ BLST_ERROR G2_read_bytes(G2* a, const byte *bin, const int len) { return BLST_BAD_ENCODING; } } - G2_set_infty(a); + E2_set_infty(a); return RLC_OK; } @@ -738,92 +851,113 @@ BLST_ERROR G2_read_bytes(G2* a, const byte *bin, const int len) { byte temp[Fp2_BYTES]; memcpy(temp, bin, Fp2_BYTES); temp[0] &= 0x1F; // clear the header bits - BLST_ERROR ret = fp2_read_bytes(a->x, temp, sizeof(temp)); + BLST_ERROR ret = Fp2_read_bytes(&a->x, temp, sizeof(temp)); if (ret != BLST_SUCCESS) { return ret; } // set a.z to 1 - Fp_copy(a->z[0], BLS12_381_pR); - Fp_set_zero(a->z[1]); + Fp2* a_z = &(a->z); + Fp_copy(&real(a_z), &BLS12_381_pR); + Fp_set_zero(&imag(a_z)); if (G2_SERIALIZATION == UNCOMPRESSED) { - ret = fp2_read_bytes(a->y, bin + Fp2_BYTES, sizeof(a->y)); + ret = Fp2_read_bytes(&(a->y), bin + Fp2_BYTES, sizeof(a->y)); if (ret != BLST_SUCCESS){ return ret; } // check read point is on curve - if (!G2_on_curve(a)) { + if (!E2_affine_on_curve(a)) { return BLST_POINT_NOT_ON_CURVE; } return BLST_SUCCESS; } // compute the possible square root - Fp_to_montg((a->x)[0], a->x[0]); - Fp_to_montg(a->x[1], a->x[1]); - - Fp2_squ_montg(a->y, a->x); - Fp2_mul_montg(a->y, a->y, a->x); - Fp2_add(a->y, a->y, B_E2); - if (!sqrt_fp2(a->y, a->y)) // (y^2 = x^3+b) has no solution in y + Fp2* a_x = &(a->x); + Fp_to_montg(&real(a_x), &real(a_x)); + Fp_to_montg(&imag(a_x), &imag(a_x)); + + Fp2* a_y = &(a->y); + Fp2_squ_montg(a_y, a_x); + Fp2_mul_montg(a_y, a_y, a_x); + Fp2_add(a_y, a_y, &B_E2); + if (!Fp2_sqrt(a_y, a_y)) // if (y^2 = x^3+b) has no solution in y return BLST_POINT_NOT_ON_CURVE; // resulting (x,y) is guaranteed to be on curve - if (Fp2_get_sign(a->y) != y_sign) { - Fp2_neg(a->y, a->y); // flip y sign if needed + if (Fp2_get_sign(a_y) != y_sign) { + Fp2_neg(a_y, a_y); // flip y sign if needed } return BLST_SUCCESS; } -// ep2_write_bin_compact exports a point in E(Fp^2) to a buffer in a compressed or uncompressed form. -// len is the allocated size of the buffer bin. -// The serialization is following: +// E2_write_bytes exports a point in E(Fp^2) to a buffer in a compressed or uncompressed form. 
+// It assumes buffer is of length G2_SER_BYTES +// The serialization follows: // https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) // The code is a modified version of Relic ep2_write_bin -void ep2_write_bin_compact(byte *bin, const ep2_t a, const int len) { - ep2_t t; - ep2_null(t); - const int G2_size = (G2_BYTES/(G2_SERIALIZATION+1)); - - if (len!=G2_size) { - RLC_THROW(ERR_NO_BUFFER); - return; - } - - if (ep2_is_infty((ep2_st *)a)) { +void E2_write_bytes(byte *bin, const G2* a) { + if (E2_is_infty(a)) { // set the infinity bit bin[0] = (G2_SERIALIZATION << 7) | 0x40; - memset(bin+1, 0, G2_size-1); + memset(bin+1, 0, G2_SER_BYTES-1); return; } - RLC_TRY { - ep2_new(t); - ep2_norm(t, (ep2_st *)a); - fp2_write_bin(bin, Fp2_BYTES, t->x, 0); + G2 tmp; + E2_to_affine(&tmp, a); + Fp2* t_x = &(tmp.x); + Fp_from_montg(&real(t_x), &real(t_x)); + Fp_from_montg(&imag(t_x), &imag(t_x)); + Fp2_write_bytes(bin, t_x); - if (G2_SERIALIZATION == COMPRESSED) { - bin[0] |= (Fp2_get_sign(t->y) << 5); - } else { - fp2_write_bin(bin + Fp2_BYTES, Fp2_BYTES, t->y, 0); - } - } RLC_CATCH_ANY { - RLC_THROW(ERR_CAUGHT); + Fp2* t_y = &(tmp.y); + if (G2_SERIALIZATION == COMPRESSED) { + bin[0] |= (Fp2_get_sign(t_y) << 5); + } else { + Fp2_write_bytes(bin + Fp2_BYTES, t_y); } bin[0] |= (G2_SERIALIZATION << 7); - ep_free(t); } // set p to infinity -static void G2_set_infty(G2* p) { +void E2_set_infty(G2* p) { vec_zero(p, G2_BYTES); } -// checks p is on G2 -static bool G2_on_curve(G2* p) { - return POINTonE2_affine_on_curve(p) | vec_is_zero(p, sizeof(*p)); +// check if `p` is infinity +bool_t E2_is_infty(const G2* p) { + return vec_is_zero(p, sizeof(*p)); +} + +// checks affine point `p` is in E2 +bool_t E2_affine_on_curve(const G2* p) { + // BLST's `POINTonE2_affine_on_curve` does not include the inifity case, + // unlike what the function name means. 
+ return POINTonE2_affine_on_curve((POINTonE2_affine*)p) | E2_is_infty(p); +} + +// checks p1 == p2 +bool_t E2_is_equal(const G2* p1, const G2* p2) { + // `POINTonE2_is_equal` includes the infinity case + return POINTonE2_is_equal((const POINTonE2*)p1, (const POINTonE2*)p2); +} + +// converts an E2 point from Jacobian into affine coordinates (z=1) +void E2_to_affine(G2* res, const G2* p) { + // minor optimization in case coordinates are already affine + if (vec_is_equal(p->z, BLS12_381_Rx.p2, Fp2_BYTES)) { + vec_copy(res, p, G2_BYTES); + return; + } + // convert from Jacobian + POINTonE2_from_Jacobian((POINTonE2*)res, (const POINTonE2*)p); +} + +void E2_add(G2* res, const G2* a, const G2* b) { + POINTonE2_dadd((POINTonE2*)res, (POINTonE2*)a, (POINTonE2*)b, NULL); } // Exponentiation of a generic point p in G2 @@ -838,7 +972,7 @@ void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo) { void G2_mult_gen(G2* res, const Fr* expo) { pow256 tmp; pow256_from_Fr(tmp, expo); - POINTonE2_sign(res, &BLS12_381_G2, tmp); + POINTonE2_sign((POINTonE2*)res, &BLS12_381_G2, tmp); } // computes the sum of the G2 array elements y and writes the sum in jointy diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 82d13798455..9d59eb8d7d4 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -45,9 +45,10 @@ var invalid = C.get_invalid() // get some constants from the C layer // var blst_errors = C.blst_get_errors() -var blst_valid = (int)(C.BLST_SUCCESS) //int(blst_errors[0]) -var blst_bad_encoding = (int)(C.BLST_BAD_ENCODING) // int(blst_errors[0]) -var blst_bad_scalar = (int)(C.BLST_BAD_SCALAR) // int(blst_errors[0]) +var blst_valid = (int)(C.BLST_SUCCESS) +var blst_bad_encoding = (int)(C.BLST_BAD_ENCODING) +var blst_bad_scalar = (int)(C.BLST_BAD_SCALAR) +var blst_point_not_on_curve = (int)(C.BLST_POINT_NOT_ON_CURVE) // initContext sets relic B12_381 parameters and precomputes some data in the C layer func (ct *ctx) initContext() error { @@ -107,7 +108,7 @@ func (x *scalar) equals(other *scalar) bool { // comparison in G2 func (p *pointG2) equals(other *pointG2) bool { - return C.ep2_cmp((*C.ep2_st)(p), (*C.ep2_st)(other)) == valid + return C.E2_is_equal((*C.G2)(p), (*C.G2)(other)) != 0 } // Comparison to zero in Fr. @@ -118,7 +119,7 @@ func (x *scalar) isZero() bool { // Comparison to point at infinity in G2. 
func (p *pointG2) isInfinity() bool { - return C.ep2_is_infty((*C.ep2_st)(p)) == 1 + return C.E2_is_infty((*C.G2)(p)) != 10 } // returns a random element of Fr in input pointer @@ -165,9 +166,8 @@ func writeScalar(dest []byte, x *scalar) { // The slice should be of size PubKeyLenBLSBLS12381 and the serialization // follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves func writePointG2(dest []byte, a *pointG2) { - C.ep2_write_bin_compact((*C.uchar)(&dest[0]), - (*C.ep2_st)(a), - (C.int)(pubKeyLengthBLSBLS12381), + C.E2_write_bytes((*C.uchar)(&dest[0]), + (*C.G2)(a), ) } @@ -207,13 +207,17 @@ func readScalarFrStar(a *scalar, src []byte) error { // The slice is expected to be of size PubKeyLenBLSBLS12381 and the deserialization will // follow the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves func readPointG2(a *pointG2, src []byte) error { - switch C.G2_read_bytes((*C.ep2_st)(a), + read := C.E2_read_bytes((*C.G2)(a), (*C.uchar)(&src[0]), - (C.int)(len(src))) { - case valid: + (C.int)(len(src))) + + switch int(read) { + case blst_valid: return nil - case invalid: - return invalidInputsErrorf("input is not a G2 point") + case blst_bad_encoding, blst_bad_scalar: + return invalidInputsErrorf("input could not deserialize to a G2 point") + case blst_point_not_on_curve: + return invalidInputsErrorf("input is not a point on curve E2") default: return errors.New("reading a G2 point failed") } @@ -244,7 +248,7 @@ func checkMembershipG1(pt *pointG1) int { // checkMembershipG2 wraps a call to a subgroup check in G2 since cgo can't be used // in go test files. func checkMembershipG2(pt *pointG2) int { - return int(C.check_membership_G2((*C.ep2_st)(pt))) + return int(C.G2_check_membership((*C.G2)(pt))) } // randPointG1 wraps a call to C since cgo can't be used in go test files. @@ -259,17 +263,19 @@ func randPointG1Complement(pt *pointG1) { C.ep_rand_G1complement((*C.ep_st)(pt)) } +/* // randPointG2 wraps a call to C since cgo can't be used in go test files. // It generates a random point in G2 and stores it in input point. func randPointG2(pt *pointG2) { - C.ep2_rand_G2((*C.ep2_st)(pt)) + C.ep2_rand_G2((*C.G2)(pt)) } // randPointG1Complement wraps a call to C since cgo can't be used in go test files. // It generates a random point in E2\G2 and stores it in input point. func randPointG2Complement(pt *pointG2) { - C.ep2_rand_G2complement((*C.ep2_st)(pt)) + C.ep2_rand_G2complement((*C.G2)(pt)) } +*/ // This is only a TEST function. // It hashes `data` to a G1 point using the tag `dst` and returns the G1 point serialization. 
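/*
 * A minimal standalone sketch of the Zcash-style header byte that
 * writePointG2/readPointG2 (and the C serialization routines above) rely on:
 * bit 7 = compression flag, bit 6 = infinity flag, bit 5 = sign of y
 * (only meaningful for compressed finite points). The helper below is
 * illustrative only and is not part of the package.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint8_t header_byte(int compressed, int is_infinity, int y_sign) {
    uint8_t h = (uint8_t)(compressed << 7);            /* as in (G2_SERIALIZATION << 7) */
    if (is_infinity)      h |= 1 << 6;                 /* infinity bit, low bits stay 0 */
    else if (compressed)  h |= (uint8_t)(y_sign << 5); /* sign-of-y bit */
    return h;
}

int main(void) {
    /* compressed finite point with sign(y) = 1 -> 0xA0 */
    uint8_t h = header_byte(1, 0, 1);
    assert((h >> 7) == 1);        /* compression flag */
    assert(((h >> 6) & 1) == 0);  /* not infinity */
    assert(((h >> 5) & 1) == 1);  /* sign of y */
    /* compressed point at infinity -> 0xC0, all lower bits cleared */
    assert(header_byte(1, 1, 0) == 0xC0);
    printf("header byte layout checks passed\n");
    return 0;
}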
diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index a6e88baa205..a6688c5871d 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -82,6 +82,7 @@ typedef struct prec_ { // TODO: to delete when Relic is removed bn_st* Fr_blst_to_relic(const Fr* x); +ep2_st* E2_blst_to_relic(const G2* x); int get_valid(); int get_invalid(); @@ -116,45 +117,49 @@ BLST_ERROR Fr_star_read_bytes(Fr* a, const uint8_t *bin, int len); void Fr_write_bytes(uint8_t *bin, const Fr* a); bool map_bytes_to_Fr(Fr*, const uint8_t*, int); -// Utility functions -ctx_t* relic_init_BLS12_381(); -prec_st* init_precomputed_data_BLS12_381(); -void precomputed_data_set(const prec_st* p); -void seed_relic(byte*, int); - -int ep_read_bin_compact(ep_t, const byte *, const int); -void ep_write_bin_compact(byte *, const ep_t, const int); -int G2_read_bytes(ep2_t, const byte *, const int); -void ep2_write_bin_compact(byte *, const ep2_t, const int); - - - +// Fp utilities +// E1 and G1 utilities +int ep_read_bin_compact(ep_t, const byte *, const int); +void ep_write_bin_compact(byte *, const ep_t, const int); void ep_mult_gen_bench(ep_t, const Fr*); void ep_mult_generic_bench(ep_t, const Fr*); void ep_mult(ep_t, const ep_t, const Fr*); -void G2_mult_gen(ep2_t, const Fr*); -void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo); - void ep_sum_vector(ep_t, ep_st*, const int); -void ep2_sum_vector(ep2_t, ep2_st*, const int); int ep_sum_vector_byte(byte*, const byte*, const int); -void ep2_subtract_vector(ep2_t res, ep2_t x, ep2_st* y, const int len); - -// membership checks int check_membership_G1(const ep_t); -int check_membership_G2(const ep2_t); - int simple_subgroup_check_G1(const ep_t); -int simple_subgroup_check_G2(const ep2_t); void ep_rand_G1(ep_t); void ep_rand_G1complement( ep_t); -void ep2_rand_G2(ep2_t); -void ep2_rand_G2complement( ep2_t); #if (MEMBERSHIP_CHECK_G1 == BOWE) int bowe_subgroup_check_G1(const ep_t); #endif +// E2 and G2 utilities +void E2_set_infty(G2* p); +bool_t E2_is_infty(const G2*); +bool_t E2_affine_on_curve(const G2*); +bool_t E2_is_equal(const G2* p1, const G2* p2); +void E2_to_affine(G2*, const G2*); +BLST_ERROR E2_read_bytes(G2*, const byte *, const int); +void E2_write_bytes(byte *, const G2*); +void G2_mult_gen(G2*, const Fr*); +void E2_add(G2* res, const G2* a, const G2* b); + +void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo); +void ep2_sum_vector(ep2_t, ep2_st*, const int); +void ep2_subtract_vector(ep2_t res, ep2_t x, ep2_st* y, const int len); +int G2_check_membership(const G2*); +int simple_subgroup_check_G2(const ep2_t); +void ep2_rand_G2(ep2_t); +void ep2_rand_G2complement( ep2_t); + +// Utility functions +ctx_t* relic_init_BLS12_381(); +prec_st* init_precomputed_data_BLS12_381(); +void precomputed_data_set(const prec_st* p); +void seed_relic(byte*, int); + // utility testing function void xmd_sha256(uint8_t *, int, uint8_t *, int, uint8_t *, int); diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index e7dba41a8eb..e8b34cbb052 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -116,16 +116,17 @@ func TestSubgroupCheck(t *testing.T) { res = checkMembershipG1(&p) assert.Equal(t, res, int(invalid)) }) - - t.Run("G2", func(t *testing.T) { - var p pointG2 - randPointG2(&p) // point in G2 - res := checkMembershipG2(&p) - assert.Equal(t, res, int(valid)) - randPointG2Complement(&p) // point in E2\G2 - res = checkMembershipG2(&p) - assert.Equal(t, res, int(invalid)) - }) + /* + t.Run("G2", func(t *testing.T) 
{ + var p pointG2 + randPointG2(&p) // point in G2 + res := checkMembershipG2(&p) + assert.Equal(t, res, int(valid)) + randPointG2Complement(&p) // point in E2\G2 + res = checkMembershipG2(&p) + assert.Equal(t, res, int(invalid)) + }) + */ } // subgroup membership check bench @@ -140,14 +141,15 @@ func BenchmarkSubgroupCheck(b *testing.B) { } b.StopTimer() }) - - b.Run("G2", func(b *testing.B) { - var p pointG2 - randPointG2(&p) - b.ResetTimer() - for i := 0; i < b.N; i++ { - _ = checkMembershipG2(&p) // G2 - } - b.StopTimer() - }) + /* + b.Run("G2", func(b *testing.B) { + var p pointG2 + randPointG2(&p) + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = checkMembershipG2(&p) // G2 + } + b.StopTimer() + }) + */ } diff --git a/crypto/bls_core.c b/crypto/bls_core.c index a1d47c73f17..03fa21ca782 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -40,10 +40,10 @@ int check_membership_G1(const ep_t p){ // // membership check in G2 is using a scalar multiplication by the group order. // TODO: switch to the faster Bowe check -int check_membership_G2(const ep2_t p){ +int G2_check_membership(const G2* p){ #if MEMBERSHIP_CHECK // check p is on curve - if (!ep2_on_curve((ep2_st*)p)) + if (!E2_affine_on_curve(p)) // TODO: remove and assume inputs are on curve? return INVALID; // check p is in G2 #if MEMBERSHIP_CHECK_G2 == EXP_ORDER @@ -84,7 +84,7 @@ void bls_sign(byte* s, const Fr* sk, const byte* data, const int len) { // and a message data. // The signature and public key are assumed to be in G1 and G2 respectively. This // function only checks the pairing equality. -static int bls_verify_ep(const ep2_t pk, const ep_t s, const byte* data, const int len) { +static int bls_verify_ep(const G2* pk, const ep_t s, const byte* data, const int len) { ep_t elemsG1[2]; ep2_t elemsG2[2]; @@ -97,9 +97,11 @@ static int bls_verify_ep(const ep2_t pk, const ep_t s, const byte* data, const i // hash to G1 map_to_G1(elemsG1[1], data, len); + ep2_st* pk_tmp = E2_blst_to_relic(pk); + // elemsG2[1] = pk ep2_new(elemsG2[1]); - ep2_copy(elemsG2[1], (ep2_st*)pk); + ep2_copy(elemsG2[1], (ep2_st*)pk_tmp); #if DOUBLE_PAIRING // elemsG2[0] = -g2 @@ -126,6 +128,7 @@ static int bls_verify_ep(const ep2_t pk, const ep_t s, const byte* data, const i ep_free(elemsG1[1]); ep2_free(elemsG2[0]); ep2_free(elemsG2[1]); + free(pk_tmp); if (core_get()->code == RLC_OK) { if (res == RLC_EQ) return VALID; @@ -326,7 +329,7 @@ int bls_verifyPerDistinctKey(const byte* sig, // membership check of the signature in G1 is verified. // membership check of pk in G2 is not verified in this function. // the membership check in G2 is separated to allow optimizing multiple verifications using the same key. -int bls_verify(const ep2_t pk, const byte* sig, const byte* data, const int len) { +int bls_verify(const G2* pk, const byte* sig, const byte* data, const int len) { ep_t s; ep_new(s); @@ -343,6 +346,7 @@ int bls_verify(const ep2_t pk, const byte* sig, const byte* data, const int len) return bls_verify_ep(pk, s, data, len); } +/* // binary tree structure to be used by bls_batch verify. // Each node contains a signature and a public key, the signature (resp. the public key) @@ -350,15 +354,15 @@ int bls_verify(const ep2_t pk, const byte* sig, const byte* data, const int len) // The leaves contain the initial signatures and public keys. 
typedef struct st_node { ep_st* sig; - ep2_st* pk; + G2* pk; struct st_node* left; struct st_node* right; } node; -static node* new_node(const ep2_st* pk, const ep_st* sig){ +static node* new_node(const G2* pk, const ep_st* sig){ node* t = (node*) malloc(sizeof(node)); if (t) { - t->pk = (ep2_st*)pk; + t->pk = (G2*)pk; t->sig = (ep_st*)sig; t->right = t->left = NULL; } @@ -374,7 +378,6 @@ static void free_tree(node* root) { // the recursive build starts with the left side first // relic free if (root->sig) ep_free(root->sig); - if (root->pk) ep2_free(root->pk); // pointer free free(root->sig); free(root->pk); @@ -397,7 +400,7 @@ static node* build_tree(const int len, const ep2_st* pks, const ep_st* sigs) { int left_len = len - right_len; // create a new node with new points - ep2_st* new_pk = (ep2_st*)malloc(sizeof(ep2_st)); + G2* new_pk = (G2*)malloc(sizeof(G2)); if (!new_pk) goto error; ep_st* new_sig = (ep_st*)malloc(sizeof(ep_st)); if (!new_sig) goto error_sig; @@ -405,7 +408,6 @@ static node* build_tree(const int len, const ep2_st* pks, const ep_st* sigs) { node* t = new_node(new_pk, new_sig); if (!t) goto error_node; ep_new(t->sig); - ep2_new(t->pk); // build the tree in a top-down way t->left = build_tree(left_len, &pks[0], &sigs[0]); @@ -415,7 +417,7 @@ static node* build_tree(const int len, const ep2_st* pks, const ep_st* sigs) { if (!t->right) { free_tree(t); goto error; } // sum the children ep_add_jacob(t->sig, t->left->sig, t->right->sig); - ep2_add_projc(t->pk, t->left->pk, t->right->pk); + E2_add(t->pk, t->left->pk, t->right->pk); return t; error_node: @@ -522,3 +524,4 @@ void bls_batchVerify(const int sigs_len, byte* results, const ep2_st* pks_input, out_sigs: free(pks); } +*/ \ No newline at end of file diff --git a/crypto/bls_include.h b/crypto/bls_include.h index 0e965bac88e..25bdf2020a7 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -21,13 +21,13 @@ #define SINGLE_PAIRING (DOUBLE_PAIRING^1) // Signature and public key membership check -#define MEMBERSHIP_CHECK 1 +#define MEMBERSHIP_CHECK 0 // TODO: switch to 1 and clean up memb check -// algorithm choice for the hashing to G1 -// both methods are similar implementations of the same optimzed SSWU +// algorithm choice for hashing to G1 +// both methods are similar implementations of the same optimized SSWU // but offer different timings. #define RELIC_SSWU 1 // relic library implementation -#define LOCAL_SSWU 2 // local implementation +#define LOCAL_SSWU 2 // local implementation #define hashToPoint LOCAL_SSWU // bls core (functions in bls_core.c) @@ -36,7 +36,7 @@ int get_pk_len(); int get_sk_len(); void bls_sign(byte*, const Fr*, const byte*, const int); -int bls_verify(const ep2_t, const byte*, const byte*, const int); +int bls_verify(const G2*, const byte*, const byte*, const int); int bls_verifyPerDistinctMessage(const byte*, const int, const byte*, const uint32_t*, const uint32_t*, const ep2_st*); int bls_verifyPerDistinctKey(const byte*, diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index b4fa5918ef7..cf293726112 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -3,6 +3,7 @@ package crypto +/* import ( "errors" "fmt" @@ -13,7 +14,7 @@ import ( "github.com/onflow/flow-go/crypto/hash" _ "github.com/onflow/flow-go/crypto/hash" -) +)*/ // BLS multi-signature using BLS12-381 curve // ([zcash]https://github.com/zkcrypto/pairing/blob/master/src/bls12_381/README.md#bls12-381) @@ -43,6 +44,7 @@ import "C" // used for signatures. 
var popKMAC = internalExpandMsgXOFKMAC128(blsPOPCipherSuite) +/* // BLSGeneratePOP returns a proof of possession (PoP) for the receiver private key. // // The KMAC hasher used in the function is guaranteed to be orthogonal to all hashers used @@ -193,13 +195,13 @@ func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { } var sum pointG2 - C.ep2_sum_vector((*C.ep2_st)(&sum), (*C.ep2_st)(&points[0]), + C.ep2_sum_vector((*C.G2)(&sum), (*C.G2)(&points[0]), (C.int)(len(points))) sumKey := newPubKeyBLSBLS12381(&sum) return sumKey, nil } - +*/ // IdentityBLSPublicKey returns an identity public key which corresponds to the point // at infinity in G2 (identity element of G2). // TODO: return a constant key instead of a newly allocated one @@ -207,11 +209,13 @@ func IdentityBLSPublicKey() PublicKey { identity := *newPubKeyBLSBLS12381(nil) // set the point to infinity - C.ep2_set_infty((*C.ep2_st)(&identity.point)) + C.E2_set_infty((*C.G2)(&identity.point)) identity.isIdentity = true return &identity } +/* + // RemoveBLSPublicKeys removes multiple BLS public keys from a given (aggregated) public key. // // The common use case assumes the aggregated public key was initially formed using @@ -248,8 +252,8 @@ func RemoveBLSPublicKeys(aggKey PublicKey, keysToRemove []PublicKey) (PublicKey, } var resultPoint pointG2 - C.ep2_subtract_vector((*C.ep2_st)(&resultPoint), (*C.ep2_st)(&aggPKBLS.point), - (*C.ep2_st)(&pointsToSubtract[0]), (C.int)(len(pointsToSubtract))) + C.ep2_subtract_vector((*C.G2)(&resultPoint), (*C.G2)(&aggPKBLS.point), + (*C.G2)(&pointsToSubtract[0]), (C.int)(len(pointsToSubtract))) resultKey := newPubKeyBLSBLS12381(&resultPoint) return resultKey, nil @@ -403,7 +407,7 @@ func VerifyBLSSignatureManyMessages( (*C.uchar)(&flatDistinctHashes[0]), (*C.uint32_t)(&lenHashes[0]), (*C.uint32_t)(&pkPerHash[0]), - (*C.ep2_st)(&allPks[0]), + (*C.G2)(&allPks[0]), ) } else { @@ -425,7 +429,7 @@ func VerifyBLSSignatureManyMessages( verif = C.bls_verifyPerDistinctKey( (*C.uchar)(&s[0]), (C.int)(len(mapPerPk)), - (*C.ep2_st)(&distinctPks[0]), + (*C.G2)(&distinctPks[0]), (*C.uint32_t)(&hashPerPk[0]), (*C.uchar)(&flatHashes[0]), (*C.uint32_t)(&lenHashes[0])) @@ -521,7 +525,7 @@ func BatchVerifyBLSSignaturesOneMessage( C.bls_batchVerify( (C.int)(len(verifInt)), (*C.uchar)(&verifInt[0]), - (*C.ep2_st)(&pkPoints[0]), + (*C.G2)(&pkPoints[0]), (*C.uchar)(&flatSigs[0]), (*C.uchar)(&h[0]), (C.int)(len(h)), @@ -570,3 +574,4 @@ var invalidSignatureError = errors.New("input signature does not deserialize to func IsInvalidSignatureError(err error) bool { return errors.Is(err, invalidSignatureError) } +*/ diff --git a/crypto/bls_test.go b/crypto/bls_test.go index a9672b8eeb7..7a93dd04998 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -6,10 +6,10 @@ package crypto import ( "crypto/rand" "encoding/hex" - "fmt" + _ "fmt" mrand "math/rand" "testing" - "time" + _ "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -18,7 +18,7 @@ import ( ) // TestBLSMainMethods is a sanity check of main signature scheme methods (keyGen, sign, verify) -func TestBLSMainMethods(t *testing.T) { +/*func TestBLSMainMethods(t *testing.T) { // test the key generation seed lengths testKeyGenSeed(t, BLSBLS12381, KeyGenSeedMinLen, KeyGenSeedMaxLen) // test the consistency with different inputs @@ -63,7 +63,7 @@ func BenchmarkBLSBLS12381Sign(b *testing.B) { func BenchmarkBLSBLS12381Verify(b *testing.B) { halg := NewExpandMsgXOFKMAC128("bench tag") benchVerify(b, BLSBLS12381, halg) -} +}*/ // 
utility function to generate a random BLS private key func randomSK(t *testing.T, seed []byte) PrivateKey { @@ -122,14 +122,14 @@ func TestBLSBLS12381Hasher(t *testing.T) { h := internalExpandMsgXOFKMAC128(blsSigCipherSuite) assert.NotNil(t, h) }) - - t.Run("constants sanity check", func(t *testing.T) { - // test that the ciphersuites exceed 16 bytes as per draft-irtf-cfrg-hash-to-curve - // The tags used by internalExpandMsgXOFKMAC128 are at least len(ciphersuite) long - assert.GreaterOrEqual(t, len(blsSigCipherSuite), 16) - assert.GreaterOrEqual(t, len(blsPOPCipherSuite), 16) - }) - + /* + t.Run("constants sanity check", func(t *testing.T) { + // test that the ciphersuites exceed 16 bytes as per draft-irtf-cfrg-hash-to-curve + // The tags used by internalExpandMsgXOFKMAC128 are at least len(ciphersuite) long + assert.GreaterOrEqual(t, len(blsSigCipherSuite), 16) + assert.GreaterOrEqual(t, len(blsPOPCipherSuite), 16) + }) + */ t.Run("orthogonal PoP and signature hashing", func(t *testing.T) { data := []byte("random_data") // empty tag hasher @@ -214,6 +214,7 @@ func TestBLSUtils(t *testing.T) { testKeySize(t, sk, PrKeyLenBLSBLS12381, PubKeyLenBLSBLS12381) } +/* // BLS Proof of Possession test func TestBLSPOP(t *testing.T) { r := time.Now().UnixNano() @@ -1111,3 +1112,4 @@ func TestBLSIdentity(t *testing.T) { assert.False(t, valid) }) } +*/ diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index e6c21004193..094f4ebc692 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -3,6 +3,7 @@ package crypto +/* // #cgo CFLAGS: // #include "bls_thresholdsign_include.h" import "C" @@ -581,7 +582,7 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, for i := index(1); int(i) <= size; i++ { C.Fr_polynomialImage( (*C.Fr)(&x[i-1]), - (*C.ep2_st)(&y[i-1]), + (*C.G2)(&y[i-1]), (*C.Fr)(&a[0]), (C.int)(len(a)), (C.uint8_t)(i), ) @@ -603,3 +604,4 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, // generating an identity key is therefore negligible. return skShares, pkShares, pkGroup, nil } +*/ diff --git a/crypto/bls_thresholdsign_test.go b/crypto/bls_thresholdsign_test.go index 0d7f7204a79..04fe28d4db4 100644 --- a/crypto/bls_thresholdsign_test.go +++ b/crypto/bls_thresholdsign_test.go @@ -3,6 +3,7 @@ package crypto +/* import ( "crypto/rand" "fmt" @@ -647,4 +648,4 @@ func BenchmarkSignatureReconstruction(b *testing.B) { require.NoError(b, err) } b.StopTimer() -} +}*/ diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 0ee8e99ddb2..d33ec372be6 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -7,7 +7,9 @@ // eventually this file would replace blst.h #include "point.h" +#include "fields.h" #include "consts.h" +#include "errors.h" #include "bls12381_utils.h" // types used by the Flow crypto library that are imported from BLST @@ -55,16 +57,9 @@ typedef _Bool bool; /* it's assumed that cgo calls modern enough compiler */ # define DEFNULL #endif -typedef enum { - BLST_SUCCESS = 0, - BLST_BAD_ENCODING, - BLST_POINT_NOT_ON_CURVE, - BLST_POINT_NOT_IN_GROUP, - BLST_AGGR_TYPE_MISMATCH, - BLST_VERIFY_FAIL, - BLST_PK_IS_INFINITY, - BLST_BAD_SCALAR, -} BLST_ERROR; +// TODO: add sanity checks that BLST_PK_IS_INFINITY is indeed the last +// enum value (eventually submit a fix to BLST) +#define BLST_BAD_SCALAR ((BLST_PK_IS_INFINITY)+1) // field elements F_r // where `r` is the order of G1/G2. @@ -72,7 +67,8 @@ typedef enum { // are represented as a little endian vector of limbs. 
// `Fr` is equivalent to type `vec256` (used internally by BLST for F_r elements). // `Fr` is defined as a struct to be exportable through cgo to the Go layer. -typedef struct {limb_t limbs[Fr_LIMBS];} Fr; +#define R_BITS +typedef struct {limb_t limbs[(R_BITS+63)/64];} Fr; // TODO: use Fr_LIMBS // field elements F_p // F_p elements are represented as big numbers reduced modulo `p`. Big numbers @@ -86,19 +82,23 @@ typedef vec384 Fp; // where x, y, x are elements of F_p (type `Fp`). // `G1` is equivelent to type `POINTonE1` (used internally by BLST for Jacobian E1 elements) // `G1` is defined as a struct to be exportable through cgo to the Go layer. -typedef struct {Fp x,y,z} G1; +typedef struct {Fp x,y,z;} G1; // field elements F_p^2 // F_p^2 elements are represented as a vector of two F_p elements. // `Fp2` is equivalent to type `vec384x` (used internally by BLST for F_p^2 elements). // `Fp2` does not need to be exported to cgo. -typedef vec384x Fp2; +typedef vec384x Fp2; +// helpers to get "real" and "imaginary" Fp elements from Fp2 pointers +#define real(p) ((*(p))[0]) +#define imag(p) ((*(p))[1]) + // Subroup G2 in E2 // G2 points are represented in Jacobian coordinates (x,y,z), // where x, y, x are elements of F_p (type `Fp`). // `G2` is equivelent to type `POINTonE2` (used internally by BLST for Jacobian E1 elements) // `G2` is defined as a struct to be exportable through cgo to the Go layer. -typedef struct {Fp2 x,y,z} G2; +typedef struct {Fp2 x,y,z;} G2; #endif diff --git a/crypto/blst_src.c b/crypto/blst_src/blst_src.c similarity index 99% rename from crypto/blst_src.c rename to crypto/blst_src/blst_src.c index 89388b703fe..4b0732e06e4 100644 --- a/crypto/blst_src.c +++ b/crypto/blst_src/blst_src.c @@ -17,3 +17,4 @@ #include "consts.c" #include "vect.c" #include "exports.c" + diff --git a/crypto/blst_src/client_min_pk.c b/crypto/blst_src/client_min_pk.c deleted file mode 100644 index df11e3dae73..00000000000 --- a/crypto/blst_src/client_min_pk.c +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ - -/*#include "keygen.c" -#include "e2.c" -#include "hash_to_field.c" -#include "map_to_g2.c" -#include "e1.c" -#include "exp.c" -#include "sqrt.c" -#include "recip.c" -#include "consts.c" -#include "vect.c" -#include "exports.c"*/ diff --git a/crypto/blst_src/client_min_sig.c b/crypto/blst_src/client_min_sig.c deleted file mode 100644 index fffbd5ad52d..00000000000 --- a/crypto/blst_src/client_min_sig.c +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ - -/*#include "keygen.c" -#include "e1.c" -#include "hash_to_field.c" -#include "map_to_g1.c" -#include "e2.c" -#include "exp.c" -#include "sqrt.c" -#include "recip.c" -#include "consts.c" -#include "vect.c" -#include "exports.c"*/ diff --git a/crypto/dkg.go b/crypto/dkg.go index 1cdf87a128e..1254db615f3 100644 --- a/crypto/dkg.go +++ b/crypto/dkg.go @@ -1,5 +1,7 @@ package crypto +/* + import ( "errors" "fmt" @@ -235,3 +237,4 @@ type DKGProcessor interface { // log describes the misbehavior. 
FlagMisbehavior(participant int, log string) } +*/ diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 5fdd6db7c79..9bf9dd8b2fc 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -1,7 +1,7 @@ // +build relic #include "dkg_include.h" - +/* #define N_max 250 #define N_bits_max 8 // log(250) @@ -69,12 +69,12 @@ void G2_polynomialImages(ep2_st *y, const int len_y, const ep2_st* A, const int } // export an array of ep2_st into an array of bytes -// the length matching is supposed to be checked +// the array must be of length (len * G2_SER_BYTES) void ep2_vector_write_bin(byte* out, const ep2_st* A, const int len) { const int size = (G2_BYTES/(G2_SERIALIZATION+1)); byte* p = out; for (int i=0; i Date: Thu, 13 Apr 2023 17:50:31 -0600 Subject: [PATCH 027/200] G2 type working for BLS --- crypto/bls.go | 2 +- crypto/bls12381_utils.c | 69 +++++-- crypto/bls12381_utils.go | 2 +- crypto/bls12381_utils.h | 3 + crypto/bls_core.c | 4 +- crypto/bls_test.go | 4 +- crypto/blst_include.h | 4 +- crypto/sign_test_utils.go | 396 +++++++++++++++++++------------------- 8 files changed, 269 insertions(+), 215 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 66f4c809e85..65113f873ba 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -348,7 +348,7 @@ func (a *blsBLS12381Algo) decodePublicKey(publicKeyBytes []byte) (PublicKey, err var pk pubKeyBLSBLS12381 err := readPointG2(&pk.point, publicKeyBytes) if err != nil { - return nil, fmt.Errorf("decode public key failed %w", err) + return nil, fmt.Errorf("decode public key failed: %w", err) } // membership check in G2 diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index b66be0932a2..3b643933366 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -493,7 +493,7 @@ static int fp_read_bin_safe(fp_t a, const uint8_t *bin, int len) { // returns the sign of y. // 1 if y > (p - 1)/2 and 0 otherwise. // y is in montgomery form -static limb_t Fp_get_sign(const fp_t y) { +static byte Fp_get_sign(const fp_t y) { return sgn0_pty_mont_384(y, BLS12_381_P, p0); } @@ -530,15 +530,19 @@ void Fp2_squ_montg(Fp2 *res, const Fp2 *a) { // checks if `a` is a quadratic residue in Fp^2. If yes, it computes // the square root in `res`. +// +// The boolean output is valid whether `a` is in Montgomery form or not, +// since montgomery constant `R` is a quadratic residue. +// However, the square root is valid only if `a` is in montgomery form. static bool_t Fp2_sqrt(Fp2 *res, const Fp2* a) { return sqrt_fp2((vec384*)res, (vec384*)a); } // returns the sign of y. // sign(y_0) if y_1 = 0, else sign(y_1) -// y coordinates are in montgommery form -static limb_t Fp2_get_sign(Fp2* y) { - return sgn0_pty_mont_384x((vec384*)y, BLS12_381_P, p0); +// y coordinates must be in montgomery form +static byte Fp2_get_sign(Fp2* y) { + return (sgn0_pty_mont_384x((vec384*)y, BLS12_381_P, p0)>>1) & 1; } // reads an Fp^2 element in `a`. 
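To see how the Fp^2 helpers above fit together, here is a small decompression sketch (not part of the patch) that mirrors the y-recovery performed by E2_read_bytes further down. It assumes `x` is already in Montgomery form and `y_sign` is the sign bit read from the serialization header; B_E2 is the curve constant b of E2 in Montgomery form, as used elsewhere in this patch.

    /* sketch only: recover y from x for a compressed E2 point */
    static BLST_ERROR recover_y_sketch(Fp2* y, const Fp2* x, byte y_sign) {
        Fp2_squ_montg(y, x);          /* y = x^2                                 */
        Fp2_mul_montg(y, y, x);       /* y = x^3                                 */
        Fp2_add(y, y, &B_E2);         /* y = x^3 + b                             */
        if (!Fp2_sqrt(y, y))          /* no root: x is not the x of a curve point */
            return BLST_POINT_NOT_ON_CURVE;
        if (Fp2_get_sign(y) != y_sign)
            Fp2_neg(y, y);            /* pick the root matching the serialized sign */
        return BLST_SUCCESS;
    }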
@@ -595,7 +599,7 @@ int ep_read_bin_compact(ep_t a, const byte *bin, const int len) { } // check if the point is infinity - int is_infinity = bin[0] & 0x40; + int is_infinity = bin[0] & (1<<6); if (is_infinity) { // check if the remaining bits are cleared if (bin[0] & 0x3F) { @@ -645,6 +649,15 @@ int ep_read_bin_compact(ep_t a, const byte *bin, const int len) { return RLC_ERR; } + +// TODO: delete aftet deleting ep_write_bin_compact +static int fp_get_sign(const fp_t y) { + bn_t bn_y; + bn_new(bn_y); + fp_prime_back(bn_y, y); + return bn_cmp(bn_y, &bls_prec->p_1div2) == RLC_GT; +} + // ep_write_bin_compact exports a point a in E(Fp) to a buffer bin in a compressed or uncompressed form. // len is the allocated size of the buffer bin. // The serialization is following: @@ -660,7 +673,7 @@ void ep_write_bin_compact(byte *bin, const ep_t a, const int len) { if (ep_is_infty(a)) { // set the infinity bit - bin[0] = (G1_SERIALIZATION << 7) | 0x40; + bin[0] = (G1_SERIALIZATION << 7) | (1<<6); memset(bin+1, 0, G1_size-1); return; } @@ -673,7 +686,7 @@ void ep_write_bin_compact(byte *bin, const ep_t a, const int len) { fp_write_bin(bin, Fp_BYTES, t->x); if (G1_SERIALIZATION == COMPRESSED) { - bin[0] |= (Fp_get_sign(t->y) << 5); + bin[0] |= (fp_get_sign(t->y) << 5); } else { fp_write_bin(bin + Fp_BYTES, Fp_BYTES, t->y); } @@ -881,11 +894,11 @@ BLST_ERROR E2_read_bytes(G2* a, const byte *bin, const int len) { Fp2* a_y = &(a->y); Fp2_squ_montg(a_y, a_x); Fp2_mul_montg(a_y, a_y, a_x); - Fp2_add(a_y, a_y, &B_E2); - if (!Fp2_sqrt(a_y, a_y)) // if (y^2 = x^3+b) has no solution in y + Fp2_add(a_y, a_y, &B_E2); // B_E2 is already in Montg form + if (!Fp2_sqrt(a_y, a_y)) // check whether x^3+b is a quadratic residue return BLST_POINT_NOT_ON_CURVE; - // resulting (x,y) is guaranteed to be on curve + // resulting (x,y) is guaranteed to be on curve (y is already in Montg form) if (Fp2_get_sign(a_y) != y_sign) { Fp2_neg(a_y, a_y); // flip y sign if needed } @@ -900,13 +913,13 @@ BLST_ERROR E2_read_bytes(G2* a, const byte *bin, const int len) { void E2_write_bytes(byte *bin, const G2* a) { if (E2_is_infty(a)) { // set the infinity bit - bin[0] = (G2_SERIALIZATION << 7) | 0x40; + bin[0] = (G2_SERIALIZATION << 7) | (1 << 6); memset(bin+1, 0, G2_SER_BYTES-1); return; } - G2 tmp; E2_to_affine(&tmp, a); + Fp2* t_x = &(tmp.x); Fp_from_montg(&real(t_x), &real(t_x)); Fp_from_montg(&imag(t_x), &imag(t_x)); @@ -916,6 +929,8 @@ void E2_write_bytes(byte *bin, const G2* a) { if (G2_SERIALIZATION == COMPRESSED) { bin[0] |= (Fp2_get_sign(t_y) << 5); } else { + Fp_from_montg(&real(t_y), &real(t_y)); + Fp_from_montg(&imag(t_y), &imag(t_y)); Fp2_write_bytes(bin + Fp2_BYTES, t_y); } @@ -1275,6 +1290,36 @@ void Fr_print_(char* s, Fr* a) { printf("%16llx", *(--p)); printf("\n"); } + +void Fp_print_(char* s, Fp* a) { + printf("[%s]:\n", s); + limb_t* p = (limb_t*)(a) + Fp_LIMBS; + for (int i=0; ix)); + Fp2_print_(".y", &(a->y)); + Fp2_print_(".z", &(a->z)); +} void fp_print_(char* s, fp_st a) { diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 9d59eb8d7d4..2297e434c2f 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -119,7 +119,7 @@ func (x *scalar) isZero() bool { // Comparison to point at infinity in G2. 
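For readability, the header-byte bit manipulations used by ep_read_bin_compact, ep_write_bin_compact, E2_read_bytes and E2_write_bytes above all follow the zcash serialization convention. The names below are illustrative only (the patch keeps the raw shifts):

    /* first serialized byte, most significant bits */
    #define SER_COMPRESSION_BIT (1 << 7)  /* 1 if the encoding is compressed           */
    #define SER_INFINITY_BIT    (1 << 6)  /* 1 if the point is the point at infinity   */
    #define SER_SIGN_BIT        (1 << 5)  /* sign of y, compressed non-infinity points */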
func (p *pointG2) isInfinity() bool { - return C.E2_is_infty((*C.G2)(p)) != 10 + return C.E2_is_infty((*C.G2)(p)) != 0 } // returns a random element of Fr in input pointer diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index a6688c5871d..521941c7fee 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -166,6 +166,9 @@ void xmd_sha256(uint8_t *, int, uint8_t *, int, uint8_t *, int); // Debugging related functions void bytes_print_(char*, byte*, int); void Fr_print_(char*, Fr*); +void Fp_print_(char*, Fp*); +void Fp2_print_(char*, const Fp2*); +void E2_print_(char*, const G2*); void fp_print_(char*, fp_t); void bn_print_(char*, bn_st*); void ep_print_(char*, ep_st*); diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 03fa21ca782..cfabca52719 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -101,7 +101,7 @@ static int bls_verify_ep(const G2* pk, const ep_t s, const byte* data, const int // elemsG2[1] = pk ep2_new(elemsG2[1]); - ep2_copy(elemsG2[1], (ep2_st*)pk_tmp); + ep2_copy(elemsG2[1], pk_tmp); #if DOUBLE_PAIRING // elemsG2[0] = -g2 @@ -336,11 +336,13 @@ int bls_verify(const G2* pk, const byte* sig, const byte* data, const int len) { // deserialize the signature into a curve point int read_ret = ep_read_bin_compact(s, sig, SIGNATURE_LEN); if (read_ret != RLC_OK) { + printf("HHH1\n"); return read_ret; } // check s is in G1 if (check_membership_G1(s) != VALID) { // only enabled if MEMBERSHIP_CHECK==1 + printf("HHH2\n"); return INVALID; } diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 7a93dd04998..2965326fb66 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -18,7 +18,7 @@ import ( ) // TestBLSMainMethods is a sanity check of main signature scheme methods (keyGen, sign, verify) -/*func TestBLSMainMethods(t *testing.T) { +func TestBLSMainMethods(t *testing.T) { // test the key generation seed lengths testKeyGenSeed(t, BLSBLS12381, KeyGenSeedMinLen, KeyGenSeedMaxLen) // test the consistency with different inputs @@ -63,7 +63,7 @@ func BenchmarkBLSBLS12381Sign(b *testing.B) { func BenchmarkBLSBLS12381Verify(b *testing.B) { halg := NewExpandMsgXOFKMAC128("bench tag") benchVerify(b, BLSBLS12381, halg) -}*/ +} // utility function to generate a random BLS private key func randomSK(t *testing.T, seed []byte) PrivateKey { diff --git a/crypto/blst_include.h b/crypto/blst_include.h index d33ec372be6..c480a68d27e 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -6,11 +6,11 @@ // extra tools to use BLST low level that are needed by the Flow crypto library // eventually this file would replace blst.h +#include "bls12381_utils.h" #include "point.h" #include "fields.h" #include "consts.h" #include "errors.h" -#include "bls12381_utils.h" // types used by the Flow crypto library that are imported from BLST // these type definitions are used as an abstraction from BLST internal types @@ -67,7 +67,7 @@ typedef _Bool bool; /* it's assumed that cgo calls modern enough compiler */ // are represented as a little endian vector of limbs. // `Fr` is equivalent to type `vec256` (used internally by BLST for F_r elements). // `Fr` is defined as a struct to be exportable through cgo to the Go layer. 
-#define R_BITS +#define R_BITS 255 typedef struct {limb_t limbs[(R_BITS+63)/64];} Fr; // TODO: use Fr_LIMBS // field elements F_p diff --git a/crypto/sign_test_utils.go b/crypto/sign_test_utils.go index e9198a0c7b5..8a81e5bb45a 100644 --- a/crypto/sign_test_utils.go +++ b/crypto/sign_test_utils.go @@ -47,62 +47,62 @@ func TestHasherErrors(t *testing.T) { // tests sign and verify are consistent for multiple generated keys and messages func testGenSignVerify(t *testing.T, salg SigningAlgorithm, halg hash.Hasher) { - t.Logf("Testing Generation/Signature/Verification for %s", salg) - // make sure the length is larger than minimum lengths of all the signaure algos - seedMinLength := 48 - seed := make([]byte, seedMinLength) - input := make([]byte, 100) - r := time.Now().UnixNano() - mrand.Seed(r) - t.Logf("math rand seed is %d", r) - - loops := 50 - for j := 0; j < loops; j++ { - n, err := mrand.Read(seed) - require.Equal(t, n, seedMinLength) - require.NoError(t, err) - sk, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - _, err = mrand.Read(input) - require.NoError(t, err) - s, err := sk.Sign(input, halg) - require.NoError(t, err) - pk := sk.PublicKey() + t.Run(fmt.Sprintf("Testing Generation/Signature/Verification for %s", salg), func(t *testing.T) { + seed := make([]byte, KeyGenSeedMinLen) + input := make([]byte, 100) + r := time.Now().UnixNano() + mrand.Seed(r) + t.Logf("math rand seed is %d", r) - // test a valid signature - result, err := pk.Verify(s, input, halg) - require.NoError(t, err) - assert.True(t, result, fmt.Sprintf( - "Verification should succeed:\n signature:%s\n message:%x\n private key:%s", s, input, sk)) + loops := 50 + for j := 0; j < loops; j++ { + n, err := mrand.Read(seed) + require.Equal(t, n, KeyGenSeedMinLen) + require.NoError(t, err) + sk, err := GeneratePrivateKey(salg, seed) + require.NoError(t, err) + _, err = mrand.Read(input) + require.NoError(t, err) + s, err := sk.Sign(input, halg) + require.NoError(t, err) + pk := sk.PublicKey() - // test with a different message - input[0] ^= 1 - result, err = pk.Verify(s, input, halg) - require.NoError(t, err) - assert.False(t, result, fmt.Sprintf( - "Verification should fail:\n signature:%s\n message:%x\n private key:%s", s, input, sk)) - input[0] ^= 1 + // test a valid signature + result, err := pk.Verify(s, input, halg) + require.NoError(t, err) + assert.True(t, result, fmt.Sprintf( + "Verification should succeed:\n signature:%s\n message:%x\n private key:%s", s, input, sk)) - // test with a valid but different key - seed[0] ^= 1 - wrongSk, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - result, err = wrongSk.PublicKey().Verify(s, input, halg) - require.NoError(t, err) - assert.False(t, result, fmt.Sprintf( - "Verification should fail:\n signature:%s\n message:%x\n private key:%s", s, input, sk)) + // test with a different message + input[0] ^= 1 + result, err = pk.Verify(s, input, halg) + require.NoError(t, err) + assert.False(t, result, fmt.Sprintf( + "Verification should fail:\n signature:%s\n message:%x\n private key:%s", s, input, sk)) + input[0] ^= 1 + + // test with a valid but different key + seed[0] ^= 1 + wrongSk, err := GeneratePrivateKey(salg, seed) + require.NoError(t, err) + result, err = wrongSk.PublicKey().Verify(s, input, halg) + require.NoError(t, err) + assert.False(t, result, fmt.Sprintf( + "Verification should fail:\n signature:%s\n message:%x\n private key:%s", s, input, sk)) + + // test a wrong signature length + invalidLen := mrand.Intn(2 * len(s)) // try random 
invalid lengths + if invalidLen == len(s) { // map to an invalid length + invalidLen = 0 + } + invalidSig := make([]byte, invalidLen) + result, err = pk.Verify(invalidSig, input, halg) + require.NoError(t, err) + assert.False(t, result, fmt.Sprintf( + "Verification should fail:\n signature:%s\n with invalid length %d", invalidSig, invalidLen)) - // test a wrong signature length - invalidLen := mrand.Intn(2 * len(s)) // try random invalid lengths - if invalidLen == len(s) { // map to an invalid length - invalidLen = 0 } - invalidSig := make([]byte, invalidLen) - result, err = pk.Verify(invalidSig, input, halg) - require.NoError(t, err) - assert.False(t, result, fmt.Sprintf( - "Verification should fail:\n signature:%s\n with invalid length %d", invalidSig, invalidLen)) - } + }) } // tests the key generation constraints with regards to the input seed, mainly @@ -154,167 +154,171 @@ func testKeyGenSeed(t *testing.T, salg SigningAlgorithm, minLen int, maxLen int) } func testEncodeDecode(t *testing.T, salg SigningAlgorithm) { - t.Logf("Testing encode/decode for %s", salg) - r := time.Now().UnixNano() - mrand.Seed(r) - t.Logf("math rand seed is %d", r) - // make sure the length is larger than minimum lengths of all the signaure algos - seedMinLength := 48 - - t.Run("happy path tests", func(t *testing.T) { - loops := 50 - for j := 0; j < loops; j++ { - // generate a private key - seed := make([]byte, seedMinLength) - read, err := mrand.Read(seed) - require.Equal(t, read, seedMinLength) - require.NoError(t, err) - sk, err := GeneratePrivateKey(salg, seed) - assert.Nil(t, err, "the key generation failed") - seed[0] ^= 1 // alter the seed to get a new private key - distinctSk, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - - // check private key encoding - skBytes := sk.Encode() - skCheck, err := DecodePrivateKey(salg, skBytes) - require.Nil(t, err, "the key decoding failed") - assert.True(t, sk.Equals(skCheck), "key equality check failed") - skCheckBytes := skCheck.Encode() - assert.Equal(t, skBytes, skCheckBytes, "keys should be equal") - distinctSkBytes := distinctSk.Encode() - assert.NotEqual(t, skBytes, distinctSkBytes, "keys should be different") - - // check public key encoding - pk := sk.PublicKey() - pkBytes := pk.Encode() - pkCheck, err := DecodePublicKey(salg, pkBytes) - require.Nil(t, err, "the key decoding failed") - assert.True(t, pk.Equals(pkCheck), "key equality check failed") - pkCheckBytes := pkCheck.Encode() - assert.Equal(t, pkBytes, pkCheckBytes, "keys should be equal") - distinctPkBytes := distinctSk.PublicKey().Encode() - assert.NotEqual(t, pkBytes, distinctPkBytes, "keys should be different") - - // same for the compressed encoding - pkComprBytes := pk.EncodeCompressed() - pkComprCheck, err := DecodePublicKeyCompressed(salg, pkComprBytes) - require.Nil(t, err, "the key decoding failed") - assert.True(t, pk.Equals(pkComprCheck), "key equality check failed") - pkCheckComprBytes := pkComprCheck.EncodeCompressed() - assert.Equal(t, pkComprBytes, pkCheckComprBytes, "keys should be equal") - distinctPkComprBytes := distinctSk.PublicKey().EncodeCompressed() - assert.NotEqual(t, pkComprBytes, distinctPkComprBytes, "keys should be different") - } - }) - - // test invalid private keys (equal to the curve group order) - t.Run("private keys equal to the group order", func(t *testing.T) { - groupOrder := make(map[SigningAlgorithm][]byte) - groupOrder[ECDSAP256] = []byte{255, 255, 255, 255, 0, 0, 0, 0, 255, 255, 255, - 255, 255, 255, 255, 255, 188, 230, 250, 173, 167, - 
23, 158, 132, 243, 185, 202, 194, 252, 99, 37, 81} + t.Run(fmt.Sprintf("Testing encode/decode for %s", salg), func(t *testing.T) { + r := time.Now().UnixNano() + mrand.Seed(r) + t.Logf("math rand seed is %d", r) - groupOrder[ECDSASecp256k1] = []byte{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 254, 186, 174, 220, 230, - 175, 72, 160, 59, 191, 210, 94, 140, 208, 54, 65, 65} + t.Run("happy path tests", func(t *testing.T) { + loops := 50 + for j := 0; j < loops; j++ { + // generate a private key + seed := make([]byte, KeyGenSeedMinLen) + read, err := mrand.Read(seed) + require.Equal(t, read, KeyGenSeedMinLen) + require.NoError(t, err) + sk, err := GeneratePrivateKey(salg, seed) + assert.Nil(t, err, "the key generation failed") + seed[0] ^= 1 // alter the seed to get a new private key + distinctSk, err := GeneratePrivateKey(salg, seed) + require.NoError(t, err) + + // check private key encoding + skBytes := sk.Encode() + skCheck, err := DecodePrivateKey(salg, skBytes) + require.Nil(t, err, "the key decoding failed") + assert.True(t, sk.Equals(skCheck), "key equality check failed") + skCheckBytes := skCheck.Encode() + assert.Equal(t, skBytes, skCheckBytes, "keys should be equal") + distinctSkBytes := distinctSk.Encode() + assert.NotEqual(t, skBytes, distinctSkBytes, "keys should be different") + + // check public key encoding + pk := sk.PublicKey() + pkBytes := pk.Encode() + pkCheck, err := DecodePublicKey(salg, pkBytes) + require.Nil(t, err, "the key decoding failed") + assert.True(t, pk.Equals(pkCheck), "key equality check failed") + pkCheckBytes := pkCheck.Encode() + assert.Equal(t, pkBytes, pkCheckBytes, "keys should be equal") + distinctPkBytes := distinctSk.PublicKey().Encode() + assert.NotEqual(t, pkBytes, distinctPkBytes, "keys should be different") + + // same for the compressed encoding + pkComprBytes := pk.EncodeCompressed() + pkComprCheck, err := DecodePublicKeyCompressed(salg, pkComprBytes) + require.Nil(t, err, "the key decoding failed") + assert.True(t, pk.Equals(pkComprCheck), "key equality check failed") + pkCheckComprBytes := pkComprCheck.EncodeCompressed() + assert.Equal(t, pkComprBytes, pkCheckComprBytes, "keys should be equal") + distinctPkComprBytes := distinctSk.PublicKey().EncodeCompressed() + assert.NotEqual(t, pkComprBytes, distinctPkComprBytes, "keys should be different") + } + }) + + // test invalid private keys (equal to the curve group order) + + t.Run("private keys equal to the group order", func(t *testing.T) { + groupOrder := make(map[SigningAlgorithm][]byte) + groupOrder[ECDSAP256] = []byte{255, 255, 255, 255, 0, 0, 0, 0, 255, 255, 255, + 255, 255, 255, 255, 255, 188, 230, 250, 173, 167, + 23, 158, 132, 243, 185, 202, 194, 252, 99, 37, 81} + + groupOrder[ECDSASecp256k1] = []byte{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 254, 186, 174, 220, 230, + 175, 72, 160, 59, 191, 210, 94, 140, 208, 54, 65, 65} + + groupOrder[BLSBLS12381] = []byte{0x73, 0xED, 0xA7, 0x53, 0x29, 0x9D, 0x7D, 0x48, 0x33, 0x39, + 0xD8, 0x08, 0x09, 0xA1, 0xD8, 0x05, 0x53, 0xBD, 0xA4, 0x02, 0xFF, 0xFE, + 0x5B, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x01} + + sk, err := DecodePrivateKey(salg, groupOrder[salg]) + require.Error(t, err, "the key decoding should fail - private key value is too large") + assert.True(t, IsInvalidInputsError(err)) + assert.Nil(t, sk) + }) - groupOrder[BLSBLS12381] = []byte{0x73, 0xED, 0xA7, 0x53, 0x29, 0x9D, 0x7D, 0x48, 0x33, 0x39, - 0xD8, 0x08, 0x09, 0xA1, 0xD8, 0x05, 0x53, 0xBD, 0xA4, 0x02, 
0xFF, 0xFE, - 0x5B, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x01} + // test invalid private and public keys (invalid length) - sk, err := DecodePrivateKey(salg, groupOrder[salg]) - require.Error(t, err, "the key decoding should fail - private key value is too large") - assert.True(t, IsInvalidInputsError(err)) - assert.Nil(t, sk) - }) + t.Run("invalid key length", func(t *testing.T) { + // private key + skLens := make(map[SigningAlgorithm]int) + skLens[ECDSAP256] = PrKeyLenECDSAP256 + skLens[ECDSASecp256k1] = PrKeyLenECDSASecp256k1 + skLens[BLSBLS12381] = 32 - // test invalid private and public keys (invalid length) - t.Run("invalid key length", func(t *testing.T) { - // private key - skLens := make(map[SigningAlgorithm]int) - skLens[ECDSAP256] = PrKeyLenECDSAP256 - skLens[ECDSASecp256k1] = PrKeyLenECDSASecp256k1 - skLens[BLSBLS12381] = 32 - - bytes := make([]byte, skLens[salg]+1) - sk, err := DecodePrivateKey(salg, bytes) - require.Error(t, err) - assert.True(t, IsInvalidInputsError(err)) - assert.Nil(t, sk) + bytes := make([]byte, skLens[salg]+1) + sk, err := DecodePrivateKey(salg, bytes) + require.Error(t, err) + assert.True(t, IsInvalidInputsError(err)) + assert.Nil(t, sk) - // public key - pkLens := make(map[SigningAlgorithm]int) - pkLens[ECDSAP256] = PubKeyLenECDSAP256 - pkLens[ECDSASecp256k1] = PubKeyLenECDSASecp256k1 - pkLens[BLSBLS12381] = 96 + // public key + pkLens := make(map[SigningAlgorithm]int) + pkLens[ECDSAP256] = PubKeyLenECDSAP256 + pkLens[ECDSASecp256k1] = PubKeyLenECDSASecp256k1 + pkLens[BLSBLS12381] = 96 - bytes = make([]byte, pkLens[salg]+1) - pk, err := DecodePublicKey(salg, bytes) - require.Error(t, err) - assert.True(t, IsInvalidInputsError(err)) - assert.Nil(t, pk) + bytes = make([]byte, pkLens[salg]+1) + pk, err := DecodePublicKey(salg, bytes) + require.Error(t, err) + assert.True(t, IsInvalidInputsError(err)) + assert.Nil(t, pk) + }) }) } func testEquals(t *testing.T, salg SigningAlgorithm, otherSigAlgo SigningAlgorithm) { - t.Logf("Testing Equals for %s", salg) - r := time.Now().UnixNano() - mrand.Seed(r) - t.Logf("math rand seed is %d", r) - // make sure the length is larger than minimum lengths of all the signaure algos - seedMinLength := 48 - - // generate a key pair - seed := make([]byte, seedMinLength) - n, err := mrand.Read(seed) - require.Equal(t, n, seedMinLength) - require.NoError(t, err) - - // first pair - sk1, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - pk1 := sk1.PublicKey() - - // second pair without changing the seed - sk2, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - pk2 := sk2.PublicKey() - - // unrelated algo pair - sk3, err := GeneratePrivateKey(otherSigAlgo, seed) - require.NoError(t, err) - pk3 := sk3.PublicKey() - - // fourth pair with same algo but a different seed - seed[0] ^= 1 - sk4, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - pk4 := sk4.PublicKey() - - // tests - assert.True(t, sk1.Equals(sk2), "key equality should return true") - assert.True(t, pk1.Equals(pk2), "key equality should return true") - assert.False(t, sk1.Equals(sk3), "key equality should return false") - assert.False(t, pk1.Equals(pk3), "key equality should return false") - assert.False(t, sk1.Equals(sk4), "key equality should return false") - assert.False(t, pk1.Equals(pk4), "key equality should return false") + t.Run(fmt.Sprintf("Testing Equals for %s", salg), func(t *testing.T) { + r := time.Now().UnixNano() + mrand.Seed(r) + t.Logf("math rand seed is %d", r) + // make sure the length is larger 
than minimum lengths of all the signaure algos + seedMinLength := 48 + + // generate a key pair + seed := make([]byte, seedMinLength) + n, err := mrand.Read(seed) + require.Equal(t, n, seedMinLength) + require.NoError(t, err) + + // first pair + sk1, err := GeneratePrivateKey(salg, seed) + require.NoError(t, err) + pk1 := sk1.PublicKey() + + // second pair without changing the seed + sk2, err := GeneratePrivateKey(salg, seed) + require.NoError(t, err) + pk2 := sk2.PublicKey() + + // unrelated algo pair + sk3, err := GeneratePrivateKey(otherSigAlgo, seed) + require.NoError(t, err) + pk3 := sk3.PublicKey() + + // fourth pair with same algo but a different seed + seed[0] ^= 1 + sk4, err := GeneratePrivateKey(salg, seed) + require.NoError(t, err) + pk4 := sk4.PublicKey() + + // tests + assert.True(t, sk1.Equals(sk2), "key equality should return true") + assert.True(t, pk1.Equals(pk2), "key equality should return true") + assert.False(t, sk1.Equals(sk3), "key equality should return false") + assert.False(t, pk1.Equals(pk3), "key equality should return false") + assert.False(t, sk1.Equals(sk4), "key equality should return false") + assert.False(t, pk1.Equals(pk4), "key equality should return false") + }) } func testKeysAlgorithm(t *testing.T, sk PrivateKey, salg SigningAlgorithm) { - t.Logf("Testing key.Algorithm for %s", salg) - alg := sk.Algorithm() - assert.Equal(t, alg, salg) - alg = sk.PublicKey().Algorithm() - assert.Equal(t, alg, salg) + t.Run(fmt.Sprintf("Testing key.Algorithm for %s", salg), func(t *testing.T) { + alg := sk.Algorithm() + assert.Equal(t, alg, salg) + alg = sk.PublicKey().Algorithm() + assert.Equal(t, alg, salg) + }) } func testKeySize(t *testing.T, sk PrivateKey, skLen int, pkLen int) { - t.Logf("Testing key.Size for %s", sk.Algorithm()) - size := sk.Size() - assert.Equal(t, size, skLen) - size = sk.PublicKey().Size() - assert.Equal(t, size, pkLen) + t.Run(fmt.Sprintf("Testing key.Size for %s", sk.Algorithm()), func(t *testing.T) { + size := sk.Size() + assert.Equal(t, size, skLen) + size = sk.PublicKey().Size() + assert.Equal(t, size, pkLen) + }) } func benchVerify(b *testing.B, algo SigningAlgorithm, halg hash.Hasher) { From 25b7be676700f50b82de352c7e201203f67f76b5 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 14 Apr 2023 12:51:46 -0600 Subject: [PATCH 028/200] integrate G2 in BLS multi-sig --- crypto/bls12381_utils.c | 60 +++++++++++++++++++----------- crypto/bls12381_utils.h | 8 +++- crypto/bls12381_utils_test.go | 1 - crypto/bls_core.c | 69 ++++++++++++++++++++--------------- crypto/bls_include.h | 6 +-- crypto/bls_multisig.go | 13 ++----- crypto/bls_test.go | 26 ++++++------- crypto/build_dependency.sh | 2 +- crypto/dkg_jointfeldman.go | 4 +- crypto/relic_build.sh | 4 +- 10 files changed, 108 insertions(+), 85 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 3b643933366..423a0a890b0 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -142,6 +142,16 @@ bn_st* Fr_blst_to_relic(const Fr* x) { return out; } +// TODO: temp utility function to delete +Fr* Fr_relic_to_blst(const bn_st* x){ + Fr* out = (Fr*)malloc(sizeof(Fr)); + byte* data = (byte*)malloc(Fr_BYTES); + bn_write_bin(data, Fr_BYTES, x); + Fr_read_bytes(out, data, Fr_BYTES); + free(data); + return out; +} + // returns true if a == 0 and false otherwise bool_t Fr_is_zero(const Fr* a) { return bytes_are_zero((const byte*)a, Fr_BYTES); @@ -159,7 +169,7 @@ void Fr_set_limb(Fr* a, const limb_t l){ } void Fr_copy(Fr* res, const Fr* a) { - 
vec_copy((byte*)res, (byte*)a, Fr_BYTES); + vec_copy((byte*)res, (byte*)a, sizeof(Fr)); } // sets `a` to 0 @@ -383,7 +393,7 @@ void Fp_set_limb(Fp* a, const limb_t l){ } void Fp_copy(Fp* res, const Fp* a) { - vec_copy((byte*)res, (byte*)a, Fp_BYTES); + vec_copy((byte*)res, (byte*)a, sizeof(Fr)); } static void Fp_add(Fp *res, const Fp *a, const Fp *b) { @@ -960,30 +970,41 @@ bool_t E2_is_equal(const G2* p1, const G2* p2) { return POINTonE2_is_equal((const POINTonE2*)p1, (const POINTonE2*)p2); } +// res = p +void E2_copy(G2* res, const G2* p) { + vec_copy(res, p, sizeof(G2)); +} + // converts an E2 point from Jacobian into affine coordinates (z=1) void E2_to_affine(G2* res, const G2* p) { // minor optimization in case coordinates are already affine if (vec_is_equal(p->z, BLS12_381_Rx.p2, Fp2_BYTES)) { - vec_copy(res, p, G2_BYTES); + E2_copy(res, p); return; } // convert from Jacobian POINTonE2_from_Jacobian((POINTonE2*)res, (const POINTonE2*)p); } +// generic point addition that must handle doubling and points at infinity void E2_add(G2* res, const G2* a, const G2* b) { POINTonE2_dadd((POINTonE2*)res, (POINTonE2*)a, (POINTonE2*)b, NULL); } -// Exponentiation of a generic point p in G2 -void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo) { - bn_st* tmp_expo = Fr_blst_to_relic(expo); - // Using window NAF of size 2 - ep2_mul_lwnaf(res, p, tmp_expo); - free(tmp_expo); +// Point negation in place. +// no need for an api of the form E2_neg(G2* res, const G2* a) for now +static void E2_neg(G2* a) { + POINTonE2_cneg((POINTonE2*)a, 1); +} + +// Exponentiation of a generic point `a` in E2, res = expo.a +void E2_mult(G2* res, const G2* a, const Fr* expo) { + pow256 tmp; + pow256_from_Fr(tmp, expo); + POINTonE2_sign((POINTonE2*)res, (POINTonE2*)a, tmp); } -// Exponentiation of generator g2 in G2 +// Exponentiation of generator g2 of G2, res = expo.g2 void G2_mult_gen(G2* res, const Fr* expo) { pow256 tmp; pow256_from_Fr(tmp, expo); @@ -991,14 +1012,11 @@ void G2_mult_gen(G2* res, const Fr* expo) { } // computes the sum of the G2 array elements y and writes the sum in jointy -void ep2_sum_vector(ep2_t jointy, ep2_st* y, const int len){ - ep2_set_infty(jointy); +void E2_sum_vector(G2* jointy, const G2* y, const int len){ + E2_set_infty(jointy); for (int i=0; ipk, root->sig, data, data_len); @@ -460,20 +463,22 @@ static void bls_batchVerify_tree(const node* root, const int len, byte* results, } // Batch verifies the validity of a multiple BLS signatures of the -// same message under multiple public keys. +// same message under multiple public keys. Each signature at index `i` is verified +// against the public key at index `i`. // // - membership checks of all signatures is verified upfront. -// - use random coefficients for signatures and public keys at the same index. +// - use random coefficients for signatures and public keys at the same index to prevent +// indices mixup. // - optimize the verification by verifying an aggregated signature against an aggregated // public key, and use a recursive verification to find invalid signatures. 
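A short note on why the per-index random coefficients matter (a restatement of the comment above, not part of the patch): for a common message with hash h, the batch check effectively tests

    e(sum_i r_i * sig_i, g2)  ==  e(h, sum_i r_i * pk_i)

with fresh non-zero r_i of at least 128 bits. Without the r_i, two signers could swap their individually valid signatures and the aggregated equation would still hold; with independent r_i such a mix-up passes only with negligible probability, which a later commit in this series exercises with the "valid signatures with incorrect indices" test.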
-void bls_batchVerify(const int sigs_len, byte* results, const ep2_st* pks_input, +void bls_batchVerify(const int sigs_len, byte* results, const G2* pks_input, const byte* sigs_bytes, const byte* data, const int data_len) { - + // initialize results to undefined memset(results, UNDEFINED, sigs_len); // build the arrays of G1 and G2 elements to verify - ep2_st* pks = (ep2_st*) malloc(sigs_len * sizeof(ep2_st)); + G2* pks = (G2*) malloc(sigs_len * sizeof(G2)); if (!pks) return; ep_st* sigs = (ep_st*) malloc(sigs_len * sizeof(ep_st)); if (!sigs) goto out_sigs; @@ -489,24 +494,30 @@ void bls_batchVerify(const int sigs_len, byte* results, const ep2_st* pks_input, // the tree aggregations remain valid. // - valid points are multiplied by a random scalar (same for public keys at same index) // to make sure a signature at index (i) is verified against the public key at the same index. + + // choose a random non-zero coefficient of at least 128 bits + // TODO: find a way to generate randoms + bn_rand(r, RLC_POS, SEC_BITS); + bn_add_dig(r, r, 1); + Fr* tmp = Fr_relic_to_blst(r); + // multiply public key by the random exponent + E2_mult(&pks[i], &pks_input[i], tmp); + int read_ret = ep_read_bin_compact(&sigs[i], &sigs_bytes[SIGNATURE_LEN*i], SIGNATURE_LEN); - if ( read_ret != RLC_OK || check_membership_G1(&sigs[i]) != VALID) { - if (read_ret == UNDEFINED) // unexpected error case + if (read_ret != RLC_OK || check_membership_G1(&sigs[i]) != VALID) { + if (read_ret == UNDEFINED) {// unexpected error case goto out; - // set signature as infinity and set result as invald - ep_set_infty(&sigs[i]); - ep2_copy(&pks[i], (ep2_st*) &pks_input[i]); - results[i] = INVALID; - // multiply signatures and public keys at the same index by random coefficients + }; + // set signature as infinity and set result as invalid + // this result won't be overwritten + ep_set_infty(&sigs[i]); + results[i] = INVALID; } else { - // random non-zero coefficient of a least 128 bits - bn_rand(r, RLC_POS, SEC_BITS); - bn_add_dig(r, r, 1); - ep_mul_lwnaf(&sigs[i], &sigs[i], r); - ep2_mul_lwnaf(&pks[i], (ep2_st*) &pks_input[i], r); + // multiply the signature by the same random exponent + ep_mul_lwnaf(&sigs[i], &sigs[i], r); } + free(tmp); } - // build a binary tree of aggreagtions node* root = build_tree(sigs_len, &pks[0], &sigs[0]); if (!root) goto out; @@ -515,15 +526,13 @@ void bls_batchVerify(const int sigs_len, byte* results, const ep2_st* pks_input, bls_batchVerify_tree(root, sigs_len, &results[0], data, data_len); // free the allocated tree free_tree(root); - + out: bn_free(r); for (int i=0; i < sigs_len; i++) { ep_free(sigs[i]); - ep2_free(pks[i]); } free(sigs); out_sigs: free(pks); } -*/ \ No newline at end of file diff --git a/crypto/bls_include.h b/crypto/bls_include.h index 25bdf2020a7..32b9f506c8c 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -38,11 +38,11 @@ int get_sk_len(); void bls_sign(byte*, const Fr*, const byte*, const int); int bls_verify(const G2*, const byte*, const byte*, const int); int bls_verifyPerDistinctMessage(const byte*, const int, const byte*, const uint32_t*, - const uint32_t*, const ep2_st*); + const uint32_t*, const G2*); int bls_verifyPerDistinctKey(const byte*, - const int, const ep2_st*, const uint32_t*, + const int, const G2*, const uint32_t*, const byte*, const uint32_t*); -void bls_batchVerify(const int, byte*, const ep2_st*, +void bls_batchVerify(const int, byte*, const G2*, const byte*, const byte*, const int); #endif diff --git a/crypto/bls_multisig.go 
b/crypto/bls_multisig.go index cf293726112..e9139183c3f 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -3,7 +3,6 @@ package crypto -/* import ( "errors" "fmt" @@ -14,7 +13,7 @@ import ( "github.com/onflow/flow-go/crypto/hash" _ "github.com/onflow/flow-go/crypto/hash" -)*/ +) // BLS multi-signature using BLS12-381 curve // ([zcash]https://github.com/zkcrypto/pairing/blob/master/src/bls12_381/README.md#bls12-381) @@ -44,7 +43,6 @@ import "C" // used for signatures. var popKMAC = internalExpandMsgXOFKMAC128(blsPOPCipherSuite) -/* // BLSGeneratePOP returns a proof of possession (PoP) for the receiver private key. // // The KMAC hasher used in the function is guaranteed to be orthogonal to all hashers used @@ -195,13 +193,13 @@ func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { } var sum pointG2 - C.ep2_sum_vector((*C.G2)(&sum), (*C.G2)(&points[0]), + C.E2_sum_vector((*C.G2)(&sum), (*C.G2)(&points[0]), (C.int)(len(points))) sumKey := newPubKeyBLSBLS12381(&sum) return sumKey, nil } -*/ + // IdentityBLSPublicKey returns an identity public key which corresponds to the point // at infinity in G2 (identity element of G2). // TODO: return a constant key instead of a newly allocated one @@ -214,8 +212,6 @@ func IdentityBLSPublicKey() PublicKey { return &identity } -/* - // RemoveBLSPublicKeys removes multiple BLS public keys from a given (aggregated) public key. // // The common use case assumes the aggregated public key was initially formed using @@ -252,7 +248,7 @@ func RemoveBLSPublicKeys(aggKey PublicKey, keysToRemove []PublicKey) (PublicKey, } var resultPoint pointG2 - C.ep2_subtract_vector((*C.G2)(&resultPoint), (*C.G2)(&aggPKBLS.point), + C.E2_subtract_vector((*C.G2)(&resultPoint), (*C.G2)(&aggPKBLS.point), (*C.G2)(&pointsToSubtract[0]), (C.int)(len(pointsToSubtract))) resultKey := newPubKeyBLSBLS12381(&resultPoint) @@ -574,4 +570,3 @@ var invalidSignatureError = errors.New("input signature does not deserialize to func IsInvalidSignatureError(err error) bool { return errors.Is(err, invalidSignatureError) } -*/ diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 2965326fb66..ad29b088481 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -6,10 +6,10 @@ package crypto import ( "crypto/rand" "encoding/hex" - _ "fmt" + "fmt" mrand "math/rand" "testing" - _ "time" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -122,14 +122,14 @@ func TestBLSBLS12381Hasher(t *testing.T) { h := internalExpandMsgXOFKMAC128(blsSigCipherSuite) assert.NotNil(t, h) }) - /* - t.Run("constants sanity check", func(t *testing.T) { - // test that the ciphersuites exceed 16 bytes as per draft-irtf-cfrg-hash-to-curve - // The tags used by internalExpandMsgXOFKMAC128 are at least len(ciphersuite) long - assert.GreaterOrEqual(t, len(blsSigCipherSuite), 16) - assert.GreaterOrEqual(t, len(blsPOPCipherSuite), 16) - }) - */ + + t.Run("constants sanity check", func(t *testing.T) { + // test that the ciphersuites exceed 16 bytes as per draft-irtf-cfrg-hash-to-curve + // The tags used by internalExpandMsgXOFKMAC128 are at least len(ciphersuite) long + assert.GreaterOrEqual(t, len(blsSigCipherSuite), 16) + assert.GreaterOrEqual(t, len(blsPOPCipherSuite), 16) + }) + t.Run("orthogonal PoP and signature hashing", func(t *testing.T) { data := []byte("random_data") // empty tag hasher @@ -214,7 +214,6 @@ func TestBLSUtils(t *testing.T) { testKeySize(t, sk, PrKeyLenBLSBLS12381, PubKeyLenBLSBLS12381) } -/* // BLS Proof of Possession test func TestBLSPOP(t 
*testing.T) { r := time.Now().UnixNano() @@ -651,9 +650,9 @@ func TestBLSBatchVerify(t *testing.T) { t.Run("one valid signature", func(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks[:1], sigs[:1], input, kmac) require.NoError(t, err) - assert.Equal(t, valid, expectedValid[:1], + assert.Equal(t, expectedValid[:1], valid, "Verification of %s failed, private keys are %s, input is %x, results is %v", - sigs, sks, input, valid) + sigs[:1], sks[:1], input, valid) }) // pick a random number of invalid signatures @@ -1112,4 +1111,3 @@ func TestBLSIdentity(t *testing.T) { assert.False(t, valid) }) } -*/ diff --git a/crypto/build_dependency.sh b/crypto/build_dependency.sh index bd5d612e9cb..4bfe99dbad2 100644 --- a/crypto/build_dependency.sh +++ b/crypto/build_dependency.sh @@ -14,7 +14,7 @@ fi rm -rf "${RELIC_DIR}" # relic version or tag -relic_version="05feb20da8507260c9b3736dc1fd2efe7876d812" +relic_version="7d885d1ba34be61bf22190943a73549a910c1714" # clone a specific version of Relic without history if it's tagged. # git -c http.sslVerify=true clone --branch $(relic_version) --single-branch --depth 1 https://github.com/relic-toolkit/relic.git ${RELIC_DIR_NAME} || { echo "git clone failed"; exit 1; } diff --git a/crypto/dkg_jointfeldman.go b/crypto/dkg_jointfeldman.go index 7ee0a9773d5..be62d3f5a73 100644 --- a/crypto/dkg_jointfeldman.go +++ b/crypto/dkg_jointfeldman.go @@ -309,12 +309,12 @@ func (s *JointFeldmanState) sumUpQualifiedKeys(qualified int) (*scalar, *pointG2 (C.int)(qualified)) // sum up Y var jointPublicKey pointG2 - C.ep2_sum_vector((*C.G2)(&jointPublicKey), + C.E2_sum_vector((*C.G2)(&jointPublicKey), (*C.G2)(&qualifiedPubKey[0]), (C.int)(qualified)) // sum up []y jointy := make([]pointG2, s.size) for i := 0; i < s.size; i++ { - C.ep2_sum_vector((*C.G2)(&jointy[i]), + C.E2_sum_vector((*C.G2)(&jointy[i]), (*C.G2)(&qualifiedy[i][0]), (C.int)(qualified)) } return &jointx, &jointPublicKey, jointy diff --git a/crypto/relic_build.sh b/crypto/relic_build.sh index 3045e22f59e..62f21ec5db5 100755 --- a/crypto/relic_build.sh +++ b/crypto/relic_build.sh @@ -63,9 +63,9 @@ PRIME=(-DFP_PRIME=381) # BN_METH=(-DBN_KARAT=0 -DBN_METHD="COMBA;COMBA;MONTY;SLIDE;BINAR;BASIC") FP_METH=(-DFP_KARAT=0 -DFP_METHD="INTEG;INTEG;INTEG;MONTY;MONTY;JMPDS;SLIDE") -PRIMES=(-DFP_PMERS=OFF -DFP_QNRES=ON -DFP_WIDTH=2) +PRIMES=(-DFP_PMERS=OFF -DFP_QNRES=ON) FPX_METH=(-DFPX_METHD="INTEG;INTEG;LAZYR") -EP_METH=(-DEP_MIXED=ON -DEP_PLAIN=OFF -DEP_ENDOM=ON -DEP_SUPER=OFF -DEP_DEPTH=4 -DEP_WIDTH=2 \ +EP_METH=(-DEP_MIXED=ON -DEP_PLAIN=OFF -DEP_ENDOM=ON -DEP_SUPER=OFF \ -DEP_CTMAP=ON -DEP_METHD="JACOB;LWNAF;COMBS;INTER") PP_METH=(-DPP_METHD="LAZYR;OATEP") From 444f75520509fd14945b25ec3fd855f8969ba0fd Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 14 Apr 2023 21:46:03 -0600 Subject: [PATCH 029/200] update BLSBatchVerify with regards to invalid signature format --- crypto/bls12381_utils.c | 12 +++++------ crypto/bls_core.c | 26 +++++++++++------------- crypto/bls_multisig.go | 45 ++++++++++++++++++++++++++--------------- crypto/bls_test.go | 30 ++++++++++++++++++++++++++- 4 files changed, 76 insertions(+), 37 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 423a0a890b0..83569661ab1 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -174,7 +174,7 @@ void Fr_copy(Fr* res, const Fr* a) { // sets `a` to 0 void Fr_set_zero(Fr* a){ - vec_zero((byte*)a, Fr_BYTES); + vec_zero((byte*)a, sizeof(Fr)); } void Fr_add(Fr *res, const Fr *a, const Fr *b) { 
@@ -313,7 +313,7 @@ BLST_ERROR Fr_read_bytes(Fr* a, const uint8_t *bin, int len) { if (!check_mod_256(tmp, BLS12_381_r)) { // check_mod_256 compares pow256 against a vec256! return BLST_BAD_SCALAR; } - vec_zero(tmp, Fr_BYTES); + vec_zero(tmp, sizeof(tmp)); limbs_from_be_bytes((limb_t*)a, bin, Fr_BYTES); return BLST_SUCCESS; } @@ -383,12 +383,12 @@ const limb_t BLS12_381_pR[Fp_LIMBS] = { ONE_MONT_P }; /* (1<<384)%p */ // sets `a` to 0 void Fp_set_zero(Fp* a){ - vec_zero((byte*)a, Fp_BYTES); + vec_zero((byte*)a, sizeof(Fp)); } // sets `a` to limb `l` void Fp_set_limb(Fp* a, const limb_t l){ - vec_zero((byte*)a + sizeof(limb_t), Fp_BYTES - sizeof(limb_t)); + vec_zero((byte*)a + sizeof(limb_t), sizeof(Fp) - sizeof(limb_t)); *((limb_t*)a) = l; } @@ -949,12 +949,12 @@ void E2_write_bytes(byte *bin, const G2* a) { // set p to infinity void E2_set_infty(G2* p) { - vec_zero(p, G2_BYTES); + vec_zero(p, sizeof(G2)); } // check if `p` is infinity bool_t E2_is_infty(const G2* p) { - return vec_is_zero(p, sizeof(*p)); + return vec_is_zero(p, sizeof(G2)); } // checks affine point `p` is in E2 diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 5764ff64da8..e89bf755e4e 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -494,29 +494,27 @@ void bls_batchVerify(const int sigs_len, byte* results, const G2* pks_input, // the tree aggregations remain valid. // - valid points are multiplied by a random scalar (same for public keys at same index) // to make sure a signature at index (i) is verified against the public key at the same index. - - // choose a random non-zero coefficient of at least 128 bits - // TODO: find a way to generate randoms - bn_rand(r, RLC_POS, SEC_BITS); - bn_add_dig(r, r, 1); - Fr* tmp = Fr_relic_to_blst(r); - // multiply public key by the random exponent - E2_mult(&pks[i], &pks_input[i], tmp); - int read_ret = ep_read_bin_compact(&sigs[i], &sigs_bytes[SIGNATURE_LEN*i], SIGNATURE_LEN); if (read_ret != RLC_OK || check_membership_G1(&sigs[i]) != VALID) { if (read_ret == UNDEFINED) {// unexpected error case goto out; }; - // set signature as infinity and set result as invalid - // this result won't be overwritten + // set signature and key to infinity (no effect on the aggregation tree) + // and set result to invalid (result won't be overwritten) + E2_set_infty(&pks[i]); ep_set_infty(&sigs[i]); results[i] = INVALID; } else { - // multiply the signature by the same random exponent + // choose a random non-zero coefficient of at least 128 bits + // TODO: find a way to generate randoms + bn_rand(r, RLC_POS, SEC_BITS); + bn_add_dig(r, r, 1); + Fr* tmp = Fr_relic_to_blst(r); + // multiply public key and signature by the same random exponent + E2_mult(&pks[i], &pks_input[i], tmp); + free(tmp); ep_mul_lwnaf(&sigs[i], &sigs[i], r); - } - free(tmp); + } } // build a binary tree of aggreagtions node* root = build_tree(sigs_len, &pks[0], &sigs[0]); diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index e9139183c3f..d074825e0e2 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -472,7 +472,6 @@ func VerifyBLSSignatureManyMessages( func BatchVerifyBLSSignaturesOneMessage( pks []PublicKey, sigs []Signature, message []byte, kmac hash.Hasher, ) ([]bool, error) { - // empty list check if len(pks) == 0 { return []bool{}, fmt.Errorf("invalid list of public keys: %w", blsAggregateEmptyListError) @@ -485,38 +484,48 @@ func BatchVerifyBLSSignaturesOneMessage( len(sigs)) } - verifBool := make([]bool, len(sigs)) + // return boolean array + returnBool := make([]bool, 
len(sigs)) + // temporary boolean array to hold the return values till all the return values are set + tmpBool := make([]bool, len(sigs)) + for i := range tmpBool { + tmpBool[i] = true // default to true + } if err := checkBLSHasher(kmac); err != nil { - return verifBool, err + return returnBool, err } - // an invalid signature with an incorrect header but correct length - invalidSig := make([]byte, signatureLengthBLSBLS12381) - invalidSig[0] = invalidBLSSignatureHeader // incorrect header - // flatten the shares (required by the C layer) flatSigs := make([]byte, 0, signatureLengthBLSBLS12381*len(sigs)) pkPoints := make([]pointG2, 0, len(pks)) + getIdentityPoint := func() pointG2 { + pk, _ := IdentityBLSPublicKey().(*pubKeyBLSBLS12381) // second value is guaranteed to be true + return pk.point + } + for i, pk := range pks { pkBLS, ok := pk.(*pubKeyBLSBLS12381) if !ok { - return verifBool, fmt.Errorf("key at index %d is invalid: %w", i, notBLSKeyError) + return returnBool, fmt.Errorf("key at index %d is invalid: %w", i, notBLSKeyError) } - pkPoints = append(pkPoints, pkBLS.point) if len(sigs[i]) != signatureLengthBLSBLS12381 || pkBLS.isIdentity { - // force the signature to be invalid by replacing it with an invalid array - // that fails the deserialization in C.ep_read_bin_compact - flatSigs = append(flatSigs, invalidSig...) + // case of invalid signature: set the signature and public key at index `i` + // to identities so that there is no effect on the aggregation tree computation. + // However, the boolean return for index `i` is set to `false` and won't be overwritten. + tmpBool[i] = false + pkPoints = append(pkPoints, getIdentityPoint()) + flatSigs = append(flatSigs, identityBLSSignature...) } else { + pkPoints = append(pkPoints, pkBLS.point) flatSigs = append(flatSigs, sigs[i]...) } } // hash the input to 128 bytes h := kmac.ComputeHash(message) - verifInt := make([]byte, len(verifBool)) + verifInt := make([]byte, len(returnBool)) C.bls_batchVerify( (C.int)(len(verifInt)), @@ -529,12 +538,16 @@ func BatchVerifyBLSSignaturesOneMessage( for i, v := range verifInt { if (C.int)(v) != valid && (C.int)(v) != invalid { - return verifBool, fmt.Errorf("batch verification failed") + return returnBool, fmt.Errorf("batch verification failed") + } + if tmpBool[i] { // only overwrite if not previously written + tmpBool[i] = ((C.int)(v) == valid) } - verifBool[i] = ((C.int)(v) == valid) } - return verifBool, nil + // make sure returnBool is []false till this point + copy(returnBool, tmpBool) + return returnBool, nil } // blsAggregateEmptyListError is returned when a list of BLS objects (e.g. signatures or keys) diff --git a/crypto/bls_test.go b/crypto/bls_test.go index ad29b088481..703ec9784b8 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -646,6 +646,27 @@ func TestBLSBatchVerify(t *testing.T) { sigs, sks, input, valid) }) + // valid signatures but indices aren't correct: sig[i] is correct under pks[j] + // and sig[j] is correct under pks[j]. + // implementations simply aggregating all signatures and keys would fail this test. 
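A related note (not part of the patch) on the identity substitution above: replacing a malformed pair with the G1 identity signature and the G2 identity key is what keeps the recursive aggregation tree usable, because the identity pair is vacuously consistent, e(0_G1, g2) = e(h, 0_G2) = 1_GT, so it cancels out of every aggregate node; only the leaf result at that index, already forced to false in tmpBool, records the failure.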
+ t.Run("valid signatures with incorrect indices", func(t *testing.T) { + i := mrand.Intn(sigsNum-1) + 1 + j := mrand.Intn(i) + // swap correct keys + pks[i], pks[j] = pks[j], pks[i] + + valid, err := BatchVerifyBLSSignaturesOneMessage(pks, sigs, input, kmac) + require.NoError(t, err) + expectedValid[i], expectedValid[j] = false, false + assert.Equal(t, valid, expectedValid, + "Verification of %s failed, private keys are %s, input is %x, results is %v", + sigs, sks, input, valid) + + // restore keys + pks[i], pks[j] = pks[j], pks[i] + expectedValid[i], expectedValid[j] = true, true + }) + // one valid signature t.Run("one valid signature", func(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks[:1], sigs[:1], input, kmac) @@ -745,6 +766,13 @@ func TestBLSBatchVerify(t *testing.T) { }) } +// Utility function that flips a point sign bit to negate the point +// this is shortcut which works only for zcash BLS12-381 compressed serialization +// Applicable to both signatures and public keys +func negatePoint(pointbytes []byte) { + pointbytes[0] ^= 0x20 +} + // alter or fix a signature func alterSignature(s Signature) { // this causes the signature to remain in G1 and be invalid @@ -1080,7 +1108,7 @@ func TestBLSIdentity(t *testing.T) { require.NoError(t, err) oppositeSig := make([]byte, signatureLengthBLSBLS12381) copy(oppositeSig, sig) - oppositeSig[0] ^= 0x20 // flip the last 3rd bit to flip the point sign + negatePoint(oppositeSig) aggSig, err := AggregateBLSSignatures([]Signature{sig, oppositeSig}) require.NoError(t, err) assert.True(t, IsBLSSignatureIdentity(aggSig)) From d70883bbbf9a4ddf3b2dab1612add7e0b3741b44 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 14 Apr 2023 22:37:34 -0600 Subject: [PATCH 030/200] fix a bug and minor updates --- crypto/bls12381_utils.c | 2 +- crypto/sign_test_utils.go | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 83569661ab1..38c665329a1 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -393,7 +393,7 @@ void Fp_set_limb(Fp* a, const limb_t l){ } void Fp_copy(Fp* res, const Fp* a) { - vec_copy((byte*)res, (byte*)a, sizeof(Fr)); + vec_copy((byte*)res, (byte*)a, sizeof(Fp)); } static void Fp_add(Fp *res, const Fp *a, const Fp *b) { diff --git a/crypto/sign_test_utils.go b/crypto/sign_test_utils.go index 8a81e5bb45a..93895429dbe 100644 --- a/crypto/sign_test_utils.go +++ b/crypto/sign_test_utils.go @@ -47,7 +47,7 @@ func TestHasherErrors(t *testing.T) { // tests sign and verify are consistent for multiple generated keys and messages func testGenSignVerify(t *testing.T, salg SigningAlgorithm, halg hash.Hasher) { - t.Run(fmt.Sprintf("Testing Generation/Signature/Verification for %s", salg), func(t *testing.T) { + t.Run(fmt.Sprintf("Generation/Signature/Verification for %s", salg), func(t *testing.T) { seed := make([]byte, KeyGenSeedMinLen) input := make([]byte, 100) r := time.Now().UnixNano() @@ -100,7 +100,6 @@ func testGenSignVerify(t *testing.T, salg SigningAlgorithm, halg hash.Hasher) { require.NoError(t, err) assert.False(t, result, fmt.Sprintf( "Verification should fail:\n signature:%s\n with invalid length %d", invalidSig, invalidLen)) - } }) } @@ -154,7 +153,7 @@ func testKeyGenSeed(t *testing.T, salg SigningAlgorithm, minLen int, maxLen int) } func testEncodeDecode(t *testing.T, salg SigningAlgorithm) { - t.Run(fmt.Sprintf("Testing encode/decode for %s", salg), func(t *testing.T) { + t.Run(fmt.Sprintf("encode/decode for 
%s", salg), func(t *testing.T) { r := time.Now().UnixNano() mrand.Seed(r) t.Logf("math rand seed is %d", r) @@ -259,7 +258,7 @@ func testEncodeDecode(t *testing.T, salg SigningAlgorithm) { } func testEquals(t *testing.T, salg SigningAlgorithm, otherSigAlgo SigningAlgorithm) { - t.Run(fmt.Sprintf("Testing Equals for %s", salg), func(t *testing.T) { + t.Run(fmt.Sprintf("equals for %s", salg), func(t *testing.T) { r := time.Now().UnixNano() mrand.Seed(r) t.Logf("math rand seed is %d", r) @@ -304,7 +303,7 @@ func testEquals(t *testing.T, salg SigningAlgorithm, otherSigAlgo SigningAlgorit } func testKeysAlgorithm(t *testing.T, sk PrivateKey, salg SigningAlgorithm) { - t.Run(fmt.Sprintf("Testing key.Algorithm for %s", salg), func(t *testing.T) { + t.Run(fmt.Sprintf("key.Algorithm for %s", salg), func(t *testing.T) { alg := sk.Algorithm() assert.Equal(t, alg, salg) alg = sk.PublicKey().Algorithm() @@ -313,7 +312,7 @@ func testKeysAlgorithm(t *testing.T, sk PrivateKey, salg SigningAlgorithm) { } func testKeySize(t *testing.T, sk PrivateKey, skLen int, pkLen int) { - t.Run(fmt.Sprintf("Testing key.Size for %s", sk.Algorithm()), func(t *testing.T) { + t.Run(fmt.Sprintf("key.Size for %s", sk.Algorithm()), func(t *testing.T) { size := sk.Size() assert.Equal(t, size, skLen) size = sk.PublicKey().Size() From 54ee84ad291752c2577828b5950b686d66264633 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Sat, 15 Apr 2023 19:31:48 -0600 Subject: [PATCH 031/200] BLS threshold signature works with new G2 type --- crypto/bls12381_utils.c | 14 +++++- crypto/bls12381_utils.h | 1 + crypto/bls_thresholdsign.go | 4 +- crypto/bls_thresholdsign_include.h | 2 +- crypto/bls_thresholdsign_test.go | 7 ++- crypto/dkg.go | 3 -- crypto/dkg_core.c | 73 +++++++++++------------------- crypto/dkg_feldmanvss.go | 19 ++++---- crypto/dkg_feldmanvssq.go | 7 ++- crypto/dkg_include.h | 12 ++--- crypto/dkg_jointfeldman.go | 3 -- crypto/dkg_test.go | 11 ++--- crypto/thresholdsign.go | 2 - 13 files changed, 67 insertions(+), 91 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 38c665329a1..61b54bb2686 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -293,6 +293,7 @@ static void pow256_from_be_bytes(pow256 ret, const unsigned char a[Fr_BYTES]) } } +// internal type of BLST `pow256` uses bytes little endian. static void pow256_from_Fr(pow256 ret, const Fr* in) { le_bytes_from_limbs(ret, (limb_t*)in, Fr_BYTES); } @@ -998,10 +999,19 @@ static void E2_neg(G2* a) { } // Exponentiation of a generic point `a` in E2, res = expo.a -void E2_mult(G2* res, const G2* a, const Fr* expo) { +void E2_mult(G2* res, const G2* p, const Fr* expo) { pow256 tmp; pow256_from_Fr(tmp, expo); - POINTonE2_sign((POINTonE2*)res, (POINTonE2*)a, tmp); + POINTonE2_sign((POINTonE2*)res, (POINTonE2*)p, tmp); +} + +// Exponentiation of a generic point `a` in E2 by a byte exponent. +void E2_mult_small_expo(G2* res, const G2* p, const byte expo) { + pow256 pow_expo; // `pow256` uses bytes little endian. 
+ pow_expo[0] = expo; + vec_zero(&pow_expo[1], 32-1); + // TODO: to bench against a specific version of mult with 8 bits expo + POINTonE2_sign((POINTonE2*)res, (POINTonE2*)p, pow_expo); } // Exponentiation of generator g2 of G2, res = expo.g2 diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index a5f636c2655..a67932fd43b 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -147,6 +147,7 @@ BLST_ERROR E2_read_bytes(G2*, const byte *, const int); void E2_write_bytes(byte *, const G2*); void G2_mult_gen(G2*, const Fr*); void E2_mult(G2*, const G2*, const Fr*); +void E2_mult_small_expo(G2*, const G2*, const byte); void E2_add(G2* res, const G2* a, const G2* b); void E2_sum_vector(G2*, const G2*, const int); diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 094f4ebc692..3fec93d96f5 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -3,7 +3,6 @@ package crypto -/* // #cgo CFLAGS: // #include "bls_thresholdsign_include.h" import "C" @@ -580,7 +579,7 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, } // compute the shares for i := index(1); int(i) <= size; i++ { - C.Fr_polynomialImage( + C.Fr_polynomial_image( (*C.Fr)(&x[i-1]), (*C.G2)(&y[i-1]), (*C.Fr)(&a[0]), (C.int)(len(a)), @@ -604,4 +603,3 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, // generating an identity key is therefore negligible. return skShares, pkShares, pkGroup, nil } -*/ diff --git a/crypto/bls_thresholdsign_include.h b/crypto/bls_thresholdsign_include.h index b3e68f46328..861ba552241 100644 --- a/crypto/bls_thresholdsign_include.h +++ b/crypto/bls_thresholdsign_include.h @@ -6,6 +6,6 @@ #include "bls_include.h" int G1_lagrangeInterpolateAtZero_serialized(byte*, const byte* , const uint8_t[], const int); -extern void Fr_polynomialImage(Fr* out, ep2_t y, const Fr* a, const int a_size, const byte x); +extern void Fr_polynomial_image(Fr* out, G2* y, const Fr* a, const int a_size, const byte x); #endif diff --git a/crypto/bls_thresholdsign_test.go b/crypto/bls_thresholdsign_test.go index 04fe28d4db4..6d873da6e68 100644 --- a/crypto/bls_thresholdsign_test.go +++ b/crypto/bls_thresholdsign_test.go @@ -3,7 +3,6 @@ package crypto -/* import ( "crypto/rand" "fmt" @@ -22,8 +21,8 @@ func TestBLSThresholdSignature(t *testing.T) { t.Run("centralized_stateless_keygen", testCentralizedStatelessAPI) // stateful API t.Run("centralized_stateful_keygen", testCentralizedStatefulAPI) - t.Run("distributed_stateful_feldmanVSS_keygen", testDistributedStatefulAPI_FeldmanVSS) - t.Run("distributed_stateful_jointFeldman_keygen", testDistributedStatefulAPI_JointFeldman) // Flow Random beacon case + //t.Run("distributed_stateful_feldmanVSS_keygen", testDistributedStatefulAPI_FeldmanVSS) + //t.Run("distributed_stateful_jointFeldman_keygen", testDistributedStatefulAPI_JointFeldman) // Flow Random beacon case } const thresholdSignatureTag = "random tag" @@ -648,4 +647,4 @@ func BenchmarkSignatureReconstruction(b *testing.B) { require.NoError(b, err) } b.StopTimer() -}*/ +} diff --git a/crypto/dkg.go b/crypto/dkg.go index 1254db615f3..1cdf87a128e 100644 --- a/crypto/dkg.go +++ b/crypto/dkg.go @@ -1,7 +1,5 @@ package crypto -/* - import ( "errors" "fmt" @@ -237,4 +235,3 @@ type DKGProcessor interface { // log describes the misbehavior. 
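As a toy illustration (not part of the patch) of what Fr_polynomial_image computes for the threshold key generation above, using small integers instead of F_r elements and ignoring the reduction mod r: with coefficients a = (a_0, a_1) = (5, 3), the polynomial is P(X) = 5 + 3X, so participants 1, 2 and 3 receive the shares P(1) = 8, P(2) = 11 and P(3) = 14; any two of these points interpolate back to P(0) = 5, which plays the role of the group secret (hence the Lagrange interpolation at zero used during signature reconstruction), while a single share on its own reveals nothing about it.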
FlagMisbehavior(participant int, log string) } -*/ diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 9bf9dd8b2fc..0dd4844c08b 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -1,7 +1,7 @@ // +build relic #include "dkg_include.h" -/* + #define N_max 250 #define N_bits_max 8 // log(250) @@ -11,9 +11,9 @@ // r being the order of G1 // writes P(x) in out and P(x).g2 in y if y is non NULL // x being a small integer -void Fr_polynomialImage_export(byte* out, ep2_t y, const Fr* a, const int a_size, const byte x){ +void Fr_polynomial_image_export(byte* out, G2* y, const Fr* a, const int a_size, const byte x){ Fr image; - Fr_polynomialImage(&image, y, a, a_size, x); + Fr_polynomial_image(&image, y, a, a_size, x); // exports the result Fr_write_bytes(out, &image); } @@ -21,7 +21,7 @@ void Fr_polynomialImage_export(byte* out, ep2_t y, const Fr* a, const int a_size // computes P(x) = a_0 + a_1 * x + .. + a_n * x^n where P is in Fr[X]. // a_i are all in Fr, `a_size` - 1 is P's degree, x is a small integer less than 255. // The function writes P(x) in `image` and P(x).g2 in `y` if `y` is non NULL -void Fr_polynomialImage(Fr* image, ep2_t y, const Fr* a, const int a_size, const byte x){ +void Fr_polynomial_image(Fr* image, G2* y, const Fr* a, const int a_size, const byte x){ Fr_set_zero(image); // convert `x` to Montgomery form Fr xR; @@ -34,78 +34,59 @@ void Fr_polynomialImage(Fr* image, ep2_t y, const Fr* a, const int a_size, const } // compute y = P(x).g2 if (y) { - bn_st* tmp = Fr_blst_to_relic(image); - g2_mul_gen(y, tmp); - free(tmp); + G2_mult_gen(y, image); } } // computes Q(x) = A_0 + A_1*x + ... + A_n*x^n in G2 // and stores the point in y -// r is the order of G2 -static void G2_polynomialImage(ep2_t y, const ep2_st* A, const int len_A, const byte x){ - - bn_t bn_x; - bn_new(bn_x); - ep2_set_infty(y); - bn_set_dig(bn_x, x); +static void G2_polynomial_image(G2* y, const G2* A, const int len_A, const byte x){ + E2_set_infty(y); for (int i = len_A-1; i >= 0 ; i--) { - ep2_mul_lwnaf(y, y, bn_x); - ep2_add_projc(y, y, (ep2_st*)&A[i]); + E2_mult_small_expo(y, y, x); // TODO: to bench against a specific version of mult with 8 bits expo + E2_add(y, y, &A[i]); } - - ep2_norm(y, y); // not necessary but called to optimize the - // multiple pairing computations with the same public key - bn_free(bn_x); } + // computes y[i] = Q(i+1) for all participants i ( 0 <= i < len_y) // where Q(x) = A_0 + A_1*x + ... 
+ A_n*x^n in G2[X] -void G2_polynomialImages(ep2_st *y, const int len_y, const ep2_st* A, const int len_A) { +void G2_polynomial_images(G2 *y, const int len_y, const G2* A, const int len_A) { for (byte i=0; i Date: Sat, 15 Apr 2023 19:36:27 -0600 Subject: [PATCH 032/200] DKG works with new G2 type --- crypto/dkg_feldmanvssq.go | 2 +- crypto/dkg_test.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index 2dfe25a6cb0..0bf5ad6445c 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -510,7 +510,7 @@ func (s *feldmanVSSQualState) checkComplaint(complainer index, c *complaint) boo // check y[complainer] == share.G2 return C.verify_share( (*C.Fr)(&c.answer), - (*C.G2)(&s.y[complainer])) != 0 + (*C.G2)(&s.y[complainer])) == 0 } // data = |complainee| diff --git a/crypto/dkg_test.go b/crypto/dkg_test.go index ff96730b855..a35d259f4f2 100644 --- a/crypto/dkg_test.go +++ b/crypto/dkg_test.go @@ -18,9 +18,9 @@ import ( var gt *testing.T func TestDKG(t *testing.T) { - //t.Run("FeldmanVSSSimple", testFeldmanVSSSimple) + t.Run("FeldmanVSSSimple", testFeldmanVSSSimple) t.Run("FeldmanVSSQual", testFeldmanVSSQual) - //t.Run("JointFeldman", testJointFeldman) + t.Run("JointFeldman", testJointFeldman) } // optimal threshold (t) to allow the largest number of malicious participants (m) From 693029f964f95f3e27c871180bccb6398af09b6a Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Sat, 15 Apr 2023 19:42:24 -0600 Subject: [PATCH 033/200] BLS-SPoCK works with new G2 type --- crypto/bls12381_utils.c | 9 ++++++--- crypto/bls12381_utils.h | 2 +- crypto/spock.go | 2 -- crypto/spock_test.go | 2 -- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 61b54bb2686..64ed4fae82c 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1037,7 +1037,7 @@ void E2_sum_vector(G2* jointy, const G2* y, const int len){ // Membership check in G2 of both keys is not verified in this function. // the membership check in G2 is separated to allow optimizing multiple verifications // using the same public keys. -int bls_spock_verify(const ep2_t pk1, const byte* sig1, const ep2_t pk2, const byte* sig2) { +int bls_spock_verify(const G2* pk1, const byte* sig1, const G2* pk2, const byte* sig2) { ep_t elemsG1[2]; ep2_t elemsG2[2]; @@ -1063,11 +1063,14 @@ int bls_spock_verify(const ep2_t pk1, const byte* sig1, const ep2_t pk2, const b // elemsG2[1] = pk1 ep2_new(elemsG2[1]); - ep2_copy(elemsG2[1], (ep2_st*)pk1); + ep2_st* tmp = E2_blst_to_relic(pk1); + ep2_copy(elemsG2[1], tmp); // elemsG2[0] = pk2 ep2_new(elemsG2[0]); - ep2_copy(elemsG2[0], (ep2_st*)pk2); + tmp = E2_blst_to_relic(pk2); + ep2_copy(elemsG2[0], tmp); + free(tmp); #if DOUBLE_PAIRING // elemsG2[0] = -pk2 diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index a67932fd43b..471f2bc7bcc 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -90,7 +90,7 @@ int get_invalid(); int get_Fr_BYTES(); // BLS based SPoCK -int bls_spock_verify(const ep2_t, const byte*, const ep2_t, const byte*); +int bls_spock_verify(const G2*, const byte*, const G2*, const byte*); // hash to curve functions (functions in bls12381_hashtocurve.c) void map_to_G1(ep_t, const byte*, const int); diff --git a/crypto/spock.go b/crypto/spock.go index 18c39f8af15..ce80a7f2275 100644 --- a/crypto/spock.go +++ b/crypto/spock.go @@ -3,7 +3,6 @@ package crypto -/* // SPoCK design based on the BLS signature scheme. 
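// A SPoCK proof under secret key sk for data m is the BLS signature sk.H(m).
// Two proofs verify against each other when e(proof1, pk2) == e(proof2, pk1),
// which is the pairing equality computed in bls_spock_verify above; by bilinearity
// it holds when both proofs were generated over the same data, and otherwise fails
// except with negligible probability.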
// BLS is using BLS12-381 curve and the same settings in bls.go. @@ -105,4 +104,3 @@ func SPOCKVerify(pk1 PublicKey, proof1 Signature, pk2 PublicKey, proof2 Signatur return false, fmt.Errorf("SPoCK verification failed") } } -*/ diff --git a/crypto/spock_test.go b/crypto/spock_test.go index 408e513bae0..45db590f04e 100644 --- a/crypto/spock_test.go +++ b/crypto/spock_test.go @@ -3,7 +3,6 @@ package crypto -/* import ( "crypto/rand" "testing" @@ -184,4 +183,3 @@ func TestSPOCKProveVerify(t *testing.T) { assert.False(t, result) }) } -*/ From 1a937638aa2541b61b38b2e4ab7d7ed93fd82c77 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 17 Apr 2023 14:47:03 -0600 Subject: [PATCH 034/200] clean up and G2 to E2 renaming --- crypto/bls.go | 12 +++---- crypto/bls12381_utils.c | 46 ++++++++++++------------ crypto/bls12381_utils.go | 58 +++++++++++++++--------------- crypto/bls12381_utils.h | 36 +++++++++---------- crypto/bls12381_utils_test.go | 34 +++++++++--------- crypto/bls_core.c | 26 +++++++------- crypto/bls_crossBLST_test.go | 2 +- crypto/bls_include.h | 8 ++--- crypto/bls_multisig.go | 36 +++++++++---------- crypto/bls_thresholdsign.go | 6 ++-- crypto/bls_thresholdsign_include.h | 2 +- crypto/blst_include.h | 22 ++++++------ crypto/dkg_core.c | 20 +++++------ crypto/dkg_feldmanvss.go | 34 +++++++++--------- crypto/dkg_feldmanvssq.go | 8 ++--- crypto/dkg_include.h | 12 +++---- crypto/dkg_jointfeldman.go | 26 +++++++------- crypto/spock.go | 4 +-- 18 files changed, 196 insertions(+), 196 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 65113f873ba..1375f7f0532 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -211,7 +211,7 @@ func (pk *pubKeyBLSBLS12381) Verify(s Signature, data []byte, kmac hash.Hasher) return false, nil } - verif := C.bls_verify((*C.G2)(&pk.point), + verif := C.bls_verify((*C.E2)(&pk.point), (*C.uchar)(&s[0]), (*C.uchar)(&h[0]), (C.int)(len(h))) @@ -352,7 +352,7 @@ func (a *blsBLS12381Algo) decodePublicKey(publicKeyBytes []byte) (PublicKey, err } // membership check in G2 - if C.G2_check_membership((*C.G2)(&pk.point)) != valid { + if C.G2_check_membership((*C.E2)(&pk.point)) != valid { return nil, invalidInputsErrorf("input key is infinity or does not encode a BLS12-381 point in the valid group") } @@ -460,7 +460,7 @@ type pubKeyBLSBLS12381 struct { // sure the comparison is performed after an instance is created. // // public key G2 point - point pointG2 + point pointE2 // G2 identity check cache isIdentity bool } @@ -468,7 +468,7 @@ type pubKeyBLSBLS12381 struct { // newPubKeyBLSBLS12381 creates a new BLS public key with the given point. // If no scalar is provided, the function allocates an // empty scalar. -func newPubKeyBLSBLS12381(p *pointG2) *pubKeyBLSBLS12381 { +func newPubKeyBLSBLS12381(p *pointE2) *pubKeyBLSBLS12381 { if p != nil { key := &pubKeyBLSBLS12381{ point: *p, @@ -546,9 +546,9 @@ func (a *blsBLS12381Algo) init() error { // This is only a TEST/DEBUG/BENCH function. 
// It returns the hash to G1 point from a slice of 128 bytes -func mapToG1(data []byte) *pointG1 { +func mapToG1(data []byte) *pointE1 { l := len(data) - var h pointG1 + var h pointE1 C.map_to_G1((*C.ep_st)(&h), (*C.uchar)(&data[0]), (C.int)(l)) return &h } diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 64ed4fae82c..d08880e4d99 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -815,7 +815,7 @@ static int ep2_read_bin_compact(ep2_t a, const byte *bin, const int len) { } // TODO: temp utility function to delete -ep2_st* E2_blst_to_relic(const G2* x) { +ep2_st* E2_blst_to_relic(const E2* x) { ep2_st* out = (ep2_st*)malloc(sizeof(ep2_st)); byte* data = (byte*)malloc(G2_SER_BYTES); E2_write_bytes(data, x); @@ -837,7 +837,7 @@ ep2_st* E2_blst_to_relic(const G2* x) { // TODO: replace with POINTonE2_Deserialize_BE and POINTonE2_Uncompress_Z, // and update logic with G2 subgroup check? -BLST_ERROR E2_read_bytes(G2* a, const byte *bin, const int len) { +BLST_ERROR E2_read_bytes(E2* a, const byte *bin, const int len) { // check the length if (len != G2_SER_BYTES) { return BLST_BAD_ENCODING; @@ -921,14 +921,14 @@ BLST_ERROR E2_read_bytes(G2* a, const byte *bin, const int len) { // The serialization follows: // https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) // The code is a modified version of Relic ep2_write_bin -void E2_write_bytes(byte *bin, const G2* a) { +void E2_write_bytes(byte *bin, const E2* a) { if (E2_is_infty(a)) { // set the infinity bit bin[0] = (G2_SERIALIZATION << 7) | (1 << 6); memset(bin+1, 0, G2_SER_BYTES-1); return; } - G2 tmp; + E2 tmp; E2_to_affine(&tmp, a); Fp2* t_x = &(tmp.x); @@ -949,35 +949,35 @@ void E2_write_bytes(byte *bin, const G2* a) { } // set p to infinity -void E2_set_infty(G2* p) { - vec_zero(p, sizeof(G2)); +void E2_set_infty(E2* p) { + vec_zero(p, sizeof(E2)); } // check if `p` is infinity -bool_t E2_is_infty(const G2* p) { - return vec_is_zero(p, sizeof(G2)); +bool_t E2_is_infty(const E2* p) { + return vec_is_zero(p, sizeof(E2)); } // checks affine point `p` is in E2 -bool_t E2_affine_on_curve(const G2* p) { +bool_t E2_affine_on_curve(const E2* p) { // BLST's `POINTonE2_affine_on_curve` does not include the inifity case, // unlike what the function name means. return POINTonE2_affine_on_curve((POINTonE2_affine*)p) | E2_is_infty(p); } // checks p1 == p2 -bool_t E2_is_equal(const G2* p1, const G2* p2) { +bool_t E2_is_equal(const E2* p1, const E2* p2) { // `POINTonE2_is_equal` includes the infinity case return POINTonE2_is_equal((const POINTonE2*)p1, (const POINTonE2*)p2); } // res = p -void E2_copy(G2* res, const G2* p) { - vec_copy(res, p, sizeof(G2)); +void E2_copy(E2* res, const E2* p) { + vec_copy(res, p, sizeof(E2)); } // converts an E2 point from Jacobian into affine coordinates (z=1) -void E2_to_affine(G2* res, const G2* p) { +void E2_to_affine(E2* res, const E2* p) { // minor optimization in case coordinates are already affine if (vec_is_equal(p->z, BLS12_381_Rx.p2, Fp2_BYTES)) { E2_copy(res, p); @@ -988,25 +988,25 @@ void E2_to_affine(G2* res, const G2* p) { } // generic point addition that must handle doubling and points at infinity -void E2_add(G2* res, const G2* a, const G2* b) { +void E2_add(E2* res, const E2* a, const E2* b) { POINTonE2_dadd((POINTonE2*)res, (POINTonE2*)a, (POINTonE2*)b, NULL); } // Point negation in place. 
-// no need for an api of the form E2_neg(G2* res, const G2* a) for now -static void E2_neg(G2* a) { +// no need for an api of the form E2_neg(E2* res, const E2* a) for now +static void E2_neg(E2* a) { POINTonE2_cneg((POINTonE2*)a, 1); } // Exponentiation of a generic point `a` in E2, res = expo.a -void E2_mult(G2* res, const G2* p, const Fr* expo) { +void E2_mult(E2* res, const E2* p, const Fr* expo) { pow256 tmp; pow256_from_Fr(tmp, expo); POINTonE2_sign((POINTonE2*)res, (POINTonE2*)p, tmp); } // Exponentiation of a generic point `a` in E2 by a byte exponent. -void E2_mult_small_expo(G2* res, const G2* p, const byte expo) { +void E2_mult_small_expo(E2* res, const E2* p, const byte expo) { pow256 pow_expo; // `pow256` uses bytes little endian. pow_expo[0] = expo; vec_zero(&pow_expo[1], 32-1); @@ -1015,14 +1015,14 @@ void E2_mult_small_expo(G2* res, const G2* p, const byte expo) { } // Exponentiation of generator g2 of G2, res = expo.g2 -void G2_mult_gen(G2* res, const Fr* expo) { +void G2_mult_gen(E2* res, const Fr* expo) { pow256 tmp; pow256_from_Fr(tmp, expo); POINTonE2_sign((POINTonE2*)res, &BLS12_381_G2, tmp); } // computes the sum of the G2 array elements y and writes the sum in jointy -void E2_sum_vector(G2* jointy, const G2* y, const int len){ +void E2_sum_vector(E2* jointy, const E2* y, const int len){ E2_set_infty(jointy); for (int i=0; ix)); Fp2_print_(".y", &(a->y)); diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 2297e434c2f..c8c08e8ac0e 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -20,8 +20,8 @@ import ( // Go wrappers around BLST C types // Go wrappers around Relic C types -type pointG1 C.ep_st -type pointG2 C.G2 +type pointE1 C.ep_st +type pointE2 C.E2 type scalar C.Fr // BLS12-381 related lengths @@ -29,8 +29,8 @@ var frBytesLen = int(C.get_Fr_BYTES()) // TODO: For now scalars are represented as field elements Fr since all scalars // are less than r - check if distinguishing two types in necessary -//type pointG1_blst C.G1 -//type pointG2_blst C.G2 +//type pointG1_blst C.E1 +//type pointG2_blst C.E2 // context required for the BLS set-up type ctx struct { @@ -79,25 +79,25 @@ func seedRelic(seed []byte) error { } // Exponentiation in G1 (scalar point multiplication) -func (p *pointG1) scalarMultG1(res *pointG1, expo *scalar) { +func (p *pointE1) scalarMultG1(res *pointE1, expo *scalar) { C.ep_mult((*C.ep_st)(res), (*C.ep_st)(p), (*C.Fr)(expo)) } // This function is for TEST only // Exponentiation of g1 in G1 -func generatorScalarMultG1(res *pointG1, expo *scalar) { +func generatorScalarMultG1(res *pointE1, expo *scalar) { C.ep_mult_gen_bench((*C.ep_st)(res), (*C.Fr)(expo)) } // This function is for TEST only // Generic Exponentiation G1 -func genericScalarMultG1(res *pointG1, expo *scalar) { +func genericScalarMultG1(res *pointE1, expo *scalar) { C.ep_mult_generic_bench((*C.ep_st)(res), (*C.Fr)(expo)) } // Exponentiation of g2 in G2 -func generatorScalarMultG2(res *pointG2, expo *scalar) { - C.G2_mult_gen((*C.G2)(res), (*C.Fr)(expo)) +func generatorScalarMultG2(res *pointE2, expo *scalar) { + C.G2_mult_gen((*C.E2)(res), (*C.Fr)(expo)) } // comparison in Fr where r is the group order of G1/G2 @@ -107,8 +107,8 @@ func (x *scalar) equals(other *scalar) bool { } // comparison in G2 -func (p *pointG2) equals(other *pointG2) bool { - return C.E2_is_equal((*C.G2)(p), (*C.G2)(other)) != 0 +func (p *pointE2) equals(other *pointE2) bool { + return C.E2_is_equal((*C.E2)(p), (*C.E2)(other)) != 0 } // Comparison to zero in Fr. 
@@ -118,8 +118,8 @@ func (x *scalar) isZero() bool { } // Comparison to point at infinity in G2. -func (p *pointG2) isInfinity() bool { - return C.E2_is_infty((*C.G2)(p)) != 0 +func (p *pointE2) isInfinity() bool { + return C.E2_is_infty((*C.E2)(p)) != 0 } // returns a random element of Fr in input pointer @@ -165,16 +165,16 @@ func writeScalar(dest []byte, x *scalar) { // writePointG2 writes a G2 point in a slice of bytes // The slice should be of size PubKeyLenBLSBLS12381 and the serialization // follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves -func writePointG2(dest []byte, a *pointG2) { +func writePointG2(dest []byte, a *pointE2) { C.E2_write_bytes((*C.uchar)(&dest[0]), - (*C.G2)(a), + (*C.E2)(a), ) } // writePointG1 writes a G1 point in a slice of bytes // The slice should be of size SignatureLenBLSBLS12381 and the serialization will // follow the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves -func writePointG1(dest []byte, a *pointG1) { +func writePointG1(dest []byte, a *pointE1) { C.ep_write_bin_compact((*C.uchar)(&dest[0]), (*C.ep_st)(a), (C.int)(signatureLengthBLSBLS12381), @@ -206,8 +206,8 @@ func readScalarFrStar(a *scalar, src []byte) error { // readPointG2 reads a G2 point from a slice of bytes // The slice is expected to be of size PubKeyLenBLSBLS12381 and the deserialization will // follow the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves -func readPointG2(a *pointG2, src []byte) error { - read := C.E2_read_bytes((*C.G2)(a), +func readPointG2(a *pointE2, src []byte) error { + read := C.E2_read_bytes((*C.E2)(a), (*C.uchar)(&src[0]), (C.int)(len(src))) @@ -226,7 +226,7 @@ func readPointG2(a *pointG2, src []byte) error { // readPointG1 reads a G1 point from a slice of bytes // The slice should be of size SignatureLenBLSBLS12381 and the deserialization will // follow the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves -func readPointG1(a *pointG1, src []byte) error { +func readPointG1(a *pointE1, src []byte) error { switch C.ep_read_bin_compact((*C.ep_st)(a), (*C.uchar)(&src[0]), (C.int)(len(src))) { @@ -241,39 +241,39 @@ func readPointG1(a *pointG1, src []byte) error { // checkMembershipG1 wraps a call to a subgroup check in G1 since cgo can't be used // in go test files. -func checkMembershipG1(pt *pointG1) int { +func checkMembershipG1(pt *pointE1) int { return int(C.check_membership_G1((*C.ep_st)(pt))) } // checkMembershipG2 wraps a call to a subgroup check in G2 since cgo can't be used // in go test files. -func checkMembershipG2(pt *pointG2) int { - return int(C.G2_check_membership((*C.G2)(pt))) +func checkMembershipG2(pt *pointE2) int { + return int(C.G2_check_membership((*C.E2)(pt))) } // randPointG1 wraps a call to C since cgo can't be used in go test files. // It generates a random point in G1 and stores it in input point. -func randPointG1(pt *pointG1) { +func randPointG1(pt *pointE1) { C.ep_rand_G1((*C.ep_st)(pt)) } // randPointG1Complement wraps a call to C since cgo can't be used in go test files. // It generates a random point in E1\G1 and stores it in input point. -func randPointG1Complement(pt *pointG1) { +func randPointG1Complement(pt *pointE1) { C.ep_rand_G1complement((*C.ep_st)(pt)) } /* // randPointG2 wraps a call to C since cgo can't be used in go test files. // It generates a random point in G2 and stores it in input point. 
-func randPointG2(pt *pointG2) { - C.ep2_rand_G2((*C.G2)(pt)) +func randPointG2(pt *pointE2) { + C.ep2_rand_G2((*C.E2)(pt)) } // randPointG1Complement wraps a call to C since cgo can't be used in go test files. // It generates a random point in E2\G2 and stores it in input point. -func randPointG2Complement(pt *pointG2) { - C.ep2_rand_G2complement((*C.G2)(pt)) +func randPointG2Complement(pt *pointE2) { + C.ep2_rand_G2complement((*C.E2)(pt)) } */ @@ -295,7 +295,7 @@ func hashToG1Bytes(data, dst []byte) []byte { (*C.uchar)(&dst[0]), (C.int)(len(dst))) // map the hash to G1 - var point pointG1 + var point pointE1 C.map_to_G1((*C.ep_st)(&point), (*C.uchar)(&hash[0]), (C.int)(len(hash))) // serialize the point diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 471f2bc7bcc..b5477187dcd 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -83,14 +83,14 @@ typedef struct prec_ { // TODO: to delete when Relic is removed bn_st* Fr_blst_to_relic(const Fr* x); Fr* Fr_relic_to_blst(const bn_st* x); -ep2_st* E2_blst_to_relic(const G2* x); +ep2_st* E2_blst_to_relic(const E2* x); int get_valid(); int get_invalid(); int get_Fr_BYTES(); // BLS based SPoCK -int bls_spock_verify(const G2*, const byte*, const G2*, const byte*); +int bls_spock_verify(const E2*, const byte*, const E2*, const byte*); // hash to curve functions (functions in bls12381_hashtocurve.c) void map_to_G1(ep_t, const byte*, const int); @@ -137,24 +137,24 @@ int bowe_subgroup_check_G1(const ep_t); #endif // E2 and G2 utilities -void E2_set_infty(G2* p); -bool_t E2_is_infty(const G2*); -bool_t E2_affine_on_curve(const G2*); -bool_t E2_is_equal(const G2* p1, const G2* p2); -void E2_copy(G2*, const G2*); -void E2_to_affine(G2*, const G2*); -BLST_ERROR E2_read_bytes(G2*, const byte *, const int); -void E2_write_bytes(byte *, const G2*); -void G2_mult_gen(G2*, const Fr*); -void E2_mult(G2*, const G2*, const Fr*); -void E2_mult_small_expo(G2*, const G2*, const byte); -void E2_add(G2* res, const G2* a, const G2* b); -void E2_sum_vector(G2*, const G2*, const int); +void E2_set_infty(E2* p); +bool_t E2_is_infty(const E2*); +bool_t E2_affine_on_curve(const E2*); +bool_t E2_is_equal(const E2* p1, const E2* p2); +void E2_copy(E2*, const E2*); +void E2_to_affine(E2*, const E2*); +BLST_ERROR E2_read_bytes(E2*, const byte *, const int); +void E2_write_bytes(byte *, const E2*); +void G2_mult_gen(E2*, const Fr*); +void E2_mult(E2*, const E2*, const Fr*); +void E2_mult_small_expo(E2*, const E2*, const byte); +void E2_add(E2* res, const E2* a, const E2* b); +void E2_sum_vector(E2*, const E2*, const int); void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo); -void E2_subtract_vector(G2* res, const G2* x, const G2* y, const int len); -int G2_check_membership(const G2*); +void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); +int G2_check_membership(const E2*); int simple_subgroup_check_G2(const ep2_t); void ep2_rand_G2(ep2_t); void ep2_rand_G2complement( ep2_t); @@ -173,7 +173,7 @@ void bytes_print_(char*, byte*, int); void Fr_print_(char*, Fr*); void Fp_print_(char*, Fp*); void Fp2_print_(char*, const Fp2*); -void E2_print_(char*, const G2*); +void E2_print_(char*, const E2*); void fp_print_(char*, fp_t); void bn_print_(char*, bn_st*); void ep_print_(char*, ep_st*); diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 337849c78f3..cf0c37d7856 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -23,7 +23,7 @@ func BenchmarkScalarMultG1G2(b 
*testing.B) { // G1 generator multiplication b.Run("G1 gen", func(b *testing.B) { - var res pointG1 + var res pointE1 b.ResetTimer() for i := 0; i < b.N; i++ { generatorScalarMultG1(&res, &expo) @@ -33,7 +33,7 @@ func BenchmarkScalarMultG1G2(b *testing.B) { // G1 base point multiplication b.Run("G1 generic", func(b *testing.B) { - var res pointG1 + var res pointE1 b.ResetTimer() for i := 0; i < b.N; i++ { genericScalarMultG1(&res, &expo) @@ -43,7 +43,7 @@ func BenchmarkScalarMultG1G2(b *testing.B) { // G2 base point multiplication b.Run("G2 gen", func(b *testing.B) { - var res pointG2 + var res pointE2 b.ResetTimer() for i := 0; i < b.N; i++ { generatorScalarMultG2(&res, &expo) @@ -60,18 +60,18 @@ func TestMapToG1(t *testing.T) { msgs := [][]byte{ []byte{}, - //[]byte("abc"), - //[]byte("abcdef0123456789"), - //[]byte("q128_qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq"), - //[]byte("a512_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"), + []byte("abc"), + []byte("abcdef0123456789"), + []byte("q128_qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq"), + []byte("a512_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"), } expectedPointString := []string{ "052926add2207b76ca4fa57a8734416c8dc95e24501772c814278700eed6d1e4e8cf62d9c09db0fac349612b759e79a1", - //"03567bc5ef9c690c2ab2ecdf6a96ef1c139cc0b2f284dca0a9a7943388a49a3aee664ba5379a7655d3c68900be2f6903", - //"11e0b079dea29a68f0383ee94fed1b940995272407e3bb916bbf268c263ddd57a6a27200a784cbc248e84f357ce82d98", - //"15f68eaa693b95ccb85215dc65fa81038d69629f70aeee0d0f677cf22285e7bf58d7cb86eefe8f2e9bc3f8cb84fac488", - //"082aabae8b7dedb0e78aeb619ad3bfd9277a2f77ba7fad20ef6aabdc6c31d19ba5a6d12283553294c1825c4b3ca2dcfe", + "03567bc5ef9c690c2ab2ecdf6a96ef1c139cc0b2f284dca0a9a7943388a49a3aee664ba5379a7655d3c68900be2f6903", + "11e0b079dea29a68f0383ee94fed1b940995272407e3bb916bbf268c263ddd57a6a27200a784cbc248e84f357ce82d98", + "15f68eaa693b95ccb85215dc65fa81038d69629f70aeee0d0f677cf22285e7bf58d7cb86eefe8f2e9bc3f8cb84fac488", + "082aabae8b7dedb0e78aeb619ad3bfd9277a2f77ba7fad20ef6aabdc6c31d19ba5a6d12283553294c1825c4b3ca2dcfe", } for i, msg := range msgs { @@ -106,8 +106,8 @@ func TestSubgroupCheck(t *testing.T) { _, _ = rand.Read(seed) _ = seedRelic(seed) - t.Run("G1", func(t *testing.T) { - var p pointG1 + /*t.Run("G1", func(t *testing.T) { + var p pointE1 randPointG1(&p) // point in G1 res := checkMembershipG1(&p) assert.Equal(t, res, int(valid)) @@ -115,9 +115,9 @@ func 
TestSubgroupCheck(t *testing.T) { res = checkMembershipG1(&p) assert.Equal(t, res, int(invalid)) }) - /* + t.Run("G2", func(t *testing.T) { - var p pointG2 + var p pointE2 randPointG2(&p) // point in G2 res := checkMembershipG2(&p) assert.Equal(t, res, int(valid)) @@ -132,7 +132,7 @@ func TestSubgroupCheck(t *testing.T) { func BenchmarkSubgroupCheck(b *testing.B) { b.Run("G1", func(b *testing.B) { - var p pointG1 + var p pointE1 randPointG1(&p) b.ResetTimer() for i := 0; i < b.N; i++ { @@ -142,7 +142,7 @@ func BenchmarkSubgroupCheck(b *testing.B) { }) /* b.Run("G2", func(b *testing.B) { - var p pointG2 + var p pointE2 randPointG2(&p) b.ResetTimer() for i := 0; i < b.N; i++ { diff --git a/crypto/bls_core.c b/crypto/bls_core.c index e89bf755e4e..6315e711484 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -40,7 +40,7 @@ int check_membership_G1(const ep_t p){ // // membership check in G2 is using a scalar multiplication by the group order. // TODO: switch to the faster Bowe check -int G2_check_membership(const G2* p){ +int G2_check_membership(const E2* p){ #if MEMBERSHIP_CHECK // check p is on curve if (!E2_affine_on_curve(p)) // TODO: remove and assume inputs are on curve? @@ -84,7 +84,7 @@ void bls_sign(byte* s, const Fr* sk, const byte* data, const int len) { // and a message data. // The signature and public key are assumed to be in G1 and G2 respectively. This // function only checks the pairing equality. -static int bls_verify_ep(const G2* pk, const ep_t s, const byte* data, const int len) { +static int bls_verify_ep(const E2* pk, const ep_t s, const byte* data, const int len) { ep_t elemsG1[2]; ep2_t elemsG2[2]; @@ -153,7 +153,7 @@ static int bls_verify_ep(const G2* pk, const ep_t s, const byte* data, const int // the membership check is separated to allow optimizing multiple verifications using the same pks int bls_verifyPerDistinctMessage(const byte* sig, const int nb_hashes, const byte* hashes, const uint32_t* len_hashes, - const uint32_t* pks_per_hash, const G2* pks) { + const uint32_t* pks_per_hash, const E2* pks) { int ret = UNDEFINED; // return value @@ -189,7 +189,7 @@ int bls_verifyPerDistinctMessage(const byte* sig, // aggregate public keys mapping to the same hash offset = 0; - G2 tmp; + E2 tmp; for (int i=1; i < nb_hashes+1; i++) { // elemsG2[i] = agg_pk[i] E2_sum_vector(&tmp, &pks[offset] , pks_per_hash[i-1]); @@ -241,7 +241,7 @@ int bls_verifyPerDistinctMessage(const byte* sig, // membership check of pks in G2 is not verified in this function // the membership check is separated to allow optimizing multiple verifications using the same pks int bls_verifyPerDistinctKey(const byte* sig, - const int nb_pks, const G2* pks, const uint32_t* hashes_per_pk, + const int nb_pks, const E2* pks, const uint32_t* hashes_per_pk, const byte* hashes, const uint32_t* len_hashes){ int ret = UNDEFINED; // return value @@ -335,7 +335,7 @@ int bls_verifyPerDistinctKey(const byte* sig, // membership check of the signature in G1 is verified. // membership check of pk in G2 is not verified in this function. // the membership check in G2 is separated to allow optimizing multiple verifications using the same key. -int bls_verify(const G2* pk, const byte* sig, const byte* data, const int len) { +int bls_verify(const E2* pk, const byte* sig, const byte* data, const int len) { ep_t s; ep_new(s); @@ -360,15 +360,15 @@ int bls_verify(const G2* pk, const byte* sig, const byte* data, const int len) { // The leaves contain the initial signatures and public keys. 
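// Each internal node holds the sum of its children's signatures (in E1) and of
// their public keys (in E2). Verification starts at the root aggregate: if it
// passes, all leaves below are marked valid in one check; if it fails, the check
// recurses into both children, so invalid signatures are isolated with roughly a
// logarithmic number of extra verifications each.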
typedef struct st_node { ep_st* sig; - G2* pk; + E2* pk; struct st_node* left; struct st_node* right; } node; -static node* new_node(const G2* pk, const ep_st* sig){ +static node* new_node(const E2* pk, const ep_st* sig){ node* t = (node*) malloc(sizeof(node)); if (t) { - t->pk = (G2*)pk; + t->pk = (E2*)pk; t->sig = (ep_st*)sig; t->right = t->left = NULL; } @@ -395,7 +395,7 @@ static void free_tree(node* root) { } // builds a binary tree of aggregation of signatures and public keys recursively. -static node* build_tree(const int len, const G2* pks, const ep_st* sigs) { +static node* build_tree(const int len, const E2* pks, const ep_st* sigs) { // check if a leaf is reached if (len == 1) { return new_node(&pks[0], &sigs[0]); // use the first element of the arrays @@ -406,7 +406,7 @@ static node* build_tree(const int len, const G2* pks, const ep_st* sigs) { int left_len = len - right_len; // create a new node with new points - G2* new_pk = (G2*)malloc(sizeof(G2)); + E2* new_pk = (E2*)malloc(sizeof(E2)); if (!new_pk) goto error; ep_st* new_sig = (ep_st*)malloc(sizeof(ep_st)); if (!new_sig) goto error_sig; @@ -471,14 +471,14 @@ static void bls_batchVerify_tree(const node* root, const int len, byte* results, // indices mixup. // - optimize the verification by verifying an aggregated signature against an aggregated // public key, and use a recursive verification to find invalid signatures. -void bls_batchVerify(const int sigs_len, byte* results, const G2* pks_input, +void bls_batchVerify(const int sigs_len, byte* results, const E2* pks_input, const byte* sigs_bytes, const byte* data, const int data_len) { // initialize results to undefined memset(results, UNDEFINED, sigs_len); // build the arrays of G1 and G2 elements to verify - G2* pks = (G2*) malloc(sigs_len * sizeof(G2)); + E2* pks = (E2*) malloc(sigs_len * sizeof(E2)); if (!pks) return; ep_st* sigs = (ep_st*) malloc(sigs_len * sizeof(ep_st)); if (!sigs) goto out_sigs; diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index 143454dfb25..949e1f6d3b7 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -151,7 +151,7 @@ func testEncodeDecodeSignatureCrossBLST(t *rapid.T) { sigBytes := rapid.OneOf(randomSlice, validSignatureFlow, validSignatureBLST).Example().([]byte) // check decoding results are consistent - var pointFlow pointG1 + var pointFlow pointE1 // here we test readPointG1 rather than the simple Signature type alias err := readPointG1(&pointFlow, sigBytes) flowPass := (err == nil) && (checkMembershipG1(&pointFlow) == int(valid)) diff --git a/crypto/bls_include.h b/crypto/bls_include.h index 32b9f506c8c..f81f2839bcf 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -36,13 +36,13 @@ int get_pk_len(); int get_sk_len(); void bls_sign(byte*, const Fr*, const byte*, const int); -int bls_verify(const G2*, const byte*, const byte*, const int); +int bls_verify(const E2*, const byte*, const byte*, const int); int bls_verifyPerDistinctMessage(const byte*, const int, const byte*, const uint32_t*, - const uint32_t*, const G2*); + const uint32_t*, const E2*); int bls_verifyPerDistinctKey(const byte*, - const int, const G2*, const uint32_t*, + const int, const E2*, const uint32_t*, const byte*, const uint32_t*); -void bls_batchVerify(const int, byte*, const G2*, +void bls_batchVerify(const int, byte*, const E2*, const byte*, const byte*, const int); #endif diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index d074825e0e2..e6589a60031 100644 --- a/crypto/bls_multisig.go +++ 
b/crypto/bls_multisig.go @@ -183,7 +183,7 @@ func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { return nil, blsAggregateEmptyListError } - points := make([]pointG2, 0, len(keys)) + points := make([]pointE2, 0, len(keys)) for i, pk := range keys { pkBLS, ok := pk.(*pubKeyBLSBLS12381) if !ok { @@ -192,8 +192,8 @@ func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { points = append(points, pkBLS.point) } - var sum pointG2 - C.E2_sum_vector((*C.G2)(&sum), (*C.G2)(&points[0]), + var sum pointE2 + C.E2_sum_vector((*C.E2)(&sum), (*C.E2)(&points[0]), (C.int)(len(points))) sumKey := newPubKeyBLSBLS12381(&sum) @@ -207,7 +207,7 @@ func IdentityBLSPublicKey() PublicKey { identity := *newPubKeyBLSBLS12381(nil) // set the point to infinity - C.E2_set_infty((*C.G2)(&identity.point)) + C.E2_set_infty((*C.E2)(&identity.point)) identity.isIdentity = true return &identity } @@ -233,7 +233,7 @@ func RemoveBLSPublicKeys(aggKey PublicKey, keysToRemove []PublicKey) (PublicKey, return nil, notBLSKeyError } - pointsToSubtract := make([]pointG2, 0, len(keysToRemove)) + pointsToSubtract := make([]pointE2, 0, len(keysToRemove)) for i, pk := range keysToRemove { pkBLS, ok := pk.(*pubKeyBLSBLS12381) if !ok { @@ -247,9 +247,9 @@ func RemoveBLSPublicKeys(aggKey PublicKey, keysToRemove []PublicKey) (PublicKey, return aggKey, nil } - var resultPoint pointG2 - C.E2_subtract_vector((*C.G2)(&resultPoint), (*C.G2)(&aggPKBLS.point), - (*C.G2)(&pointsToSubtract[0]), (C.int)(len(pointsToSubtract))) + var resultPoint pointE2 + C.E2_subtract_vector((*C.E2)(&resultPoint), (*C.E2)(&aggPKBLS.point), + (*C.E2)(&pointsToSubtract[0]), (C.int)(len(pointsToSubtract))) resultKey := newPubKeyBLSBLS12381(&resultPoint) return resultKey, nil @@ -356,13 +356,13 @@ func VerifyBLSSignatureManyMessages( // The comparison of the maps length minimizes the number of pairings to // compute by aggregating either public keys or the message hashes in // the verification equation. - mapPerHash := make(map[string][]pointG2) - mapPerPk := make(map[pointG2][][]byte) + mapPerHash := make(map[string][]pointE2) + mapPerPk := make(map[pointE2][][]byte) // Note: mapPerPk is using a cgo structure as map keys which may lead to 2 equal public keys // being considered distinct. This does not make the verification equation wrong but leads to // computing extra pairings. This case is considered unlikely to happen since a caller is likely // to use the same struct for a same public key. - // One way to fix this is to use the public key encoding as the map keys and store the "pointG2" + // One way to fix this is to use the public key encoding as the map keys and store the "pointE2" // structure with the map value, which adds more complexity and processing time. // fill the 2 maps @@ -390,7 +390,7 @@ func VerifyBLSSignatureManyMessages( flatDistinctHashes := make([]byte, 0) lenHashes := make([]uint32, 0) pkPerHash := make([]uint32, 0, len(mapPerHash)) - allPks := make([]pointG2, 0) + allPks := make([]pointE2, 0) for hash, pksVal := range mapPerHash { flatDistinctHashes = append(flatDistinctHashes, []byte(hash)...) lenHashes = append(lenHashes, uint32(len([]byte(hash)))) @@ -403,13 +403,13 @@ func VerifyBLSSignatureManyMessages( (*C.uchar)(&flatDistinctHashes[0]), (*C.uint32_t)(&lenHashes[0]), (*C.uint32_t)(&pkPerHash[0]), - (*C.G2)(&allPks[0]), + (*C.E2)(&allPks[0]), ) } else { // aggregate hashes per distinct key // using the linearity of the pairing on the G1 variables. 
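		// i.e. the equation checked becomes e(s, g2) == prod_k e(sum_j H(m_{k,j}), pk_k),
		// costing one pairing per distinct key, whereas the branch above aggregates
		// public keys in G2 for one pairing per distinct message; both groupings are
		// valid by bilinearity of the pairing.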
- distinctPks := make([]pointG2, 0, len(mapPerPk)) + distinctPks := make([]pointE2, 0, len(mapPerPk)) hashPerPk := make([]uint32, 0, len(mapPerPk)) flatHashes := make([]byte, 0) lenHashes := make([]uint32, 0) @@ -425,7 +425,7 @@ func VerifyBLSSignatureManyMessages( verif = C.bls_verifyPerDistinctKey( (*C.uchar)(&s[0]), (C.int)(len(mapPerPk)), - (*C.G2)(&distinctPks[0]), + (*C.E2)(&distinctPks[0]), (*C.uint32_t)(&hashPerPk[0]), (*C.uchar)(&flatHashes[0]), (*C.uint32_t)(&lenHashes[0])) @@ -497,9 +497,9 @@ func BatchVerifyBLSSignaturesOneMessage( // flatten the shares (required by the C layer) flatSigs := make([]byte, 0, signatureLengthBLSBLS12381*len(sigs)) - pkPoints := make([]pointG2, 0, len(pks)) + pkPoints := make([]pointE2, 0, len(pks)) - getIdentityPoint := func() pointG2 { + getIdentityPoint := func() pointE2 { pk, _ := IdentityBLSPublicKey().(*pubKeyBLSBLS12381) // second value is guaranteed to be true return pk.point } @@ -530,7 +530,7 @@ func BatchVerifyBLSSignaturesOneMessage( C.bls_batchVerify( (C.int)(len(verifInt)), (*C.uchar)(&verifInt[0]), - (*C.G2)(&pkPoints[0]), + (*C.E2)(&pkPoints[0]), (*C.uchar)(&flatSigs[0]), (*C.uchar)(&h[0]), (C.int)(len(h)), diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 3fec93d96f5..72fa421def3 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -555,8 +555,8 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, // the scalars x and G2 points y x := make([]scalar, size) - y := make([]pointG2, size) - var X0 pointG2 + y := make([]pointE2, size) + var X0 pointE2 // seed relic if err := seedRelic(seed); err != nil { @@ -581,7 +581,7 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, for i := index(1); int(i) <= size; i++ { C.Fr_polynomial_image( (*C.Fr)(&x[i-1]), - (*C.G2)(&y[i-1]), + (*C.E2)(&y[i-1]), (*C.Fr)(&a[0]), (C.int)(len(a)), (C.uint8_t)(i), ) diff --git a/crypto/bls_thresholdsign_include.h b/crypto/bls_thresholdsign_include.h index 861ba552241..1bc5809d405 100644 --- a/crypto/bls_thresholdsign_include.h +++ b/crypto/bls_thresholdsign_include.h @@ -6,6 +6,6 @@ #include "bls_include.h" int G1_lagrangeInterpolateAtZero_serialized(byte*, const byte* , const uint8_t[], const int); -extern void Fr_polynomial_image(Fr* out, G2* y, const Fr* a, const int a_size, const byte x); +extern void Fr_polynomial_image(Fr* out, E2* y, const Fr* a, const int a_size, const byte x); #endif diff --git a/crypto/blst_include.h b/crypto/blst_include.h index c480a68d27e..65f552d6fae 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -77,12 +77,13 @@ typedef struct {limb_t limbs[(R_BITS+63)/64];} Fr; // TODO: use Fr_LIMBS // `Fp` does not need to be exported to cgo. typedef vec384 Fp; -// Subroup G1 in E1 -// G1 points are represented in Jacobian coordinates (x,y,z), +// curve E_1 (over F_p) +// E_1 points are represented in Jacobian coordinates (x,y,z), // where x, y, x are elements of F_p (type `Fp`). -// `G1` is equivelent to type `POINTonE1` (used internally by BLST for Jacobian E1 elements) -// `G1` is defined as a struct to be exportable through cgo to the Go layer. -typedef struct {Fp x,y,z;} G1; +// `E1` is equivelent to type `POINTonE1` (used internally by BLST for Jacobian E1 elements) +// `E1` is defined as a struct to be exportable through cgo to the Go layer. +// `E1` is also used to represent all subgroup G_1 elements. 
+typedef struct {Fp x,y,z;} E1; // field elements F_p^2 // F_p^2 elements are represented as a vector of two F_p elements. @@ -94,11 +95,12 @@ typedef vec384x Fp2; #define imag(p) ((*(p))[1]) -// Subroup G2 in E2 -// G2 points are represented in Jacobian coordinates (x,y,z), +// curve E_2 (over F_p^2) +// E_2 points are represented in Jacobian coordinates (x,y,z), // where x, y, x are elements of F_p (type `Fp`). -// `G2` is equivelent to type `POINTonE2` (used internally by BLST for Jacobian E1 elements) -// `G2` is defined as a struct to be exportable through cgo to the Go layer. -typedef struct {Fp2 x,y,z;} G2; +// `E2` is equivelent to type `POINTonE2` (used internally by BLST for Jacobian E2 elements) +// `E2` is defined as a struct to be exportable through cgo to the Go layer. +// `E2` is also used to represent all subgroup G_2 elements. +typedef struct {Fp2 x,y,z;} E2; #endif diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 0dd4844c08b..aedf5d83164 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -11,7 +11,7 @@ // r being the order of G1 // writes P(x) in out and P(x).g2 in y if y is non NULL // x being a small integer -void Fr_polynomial_image_export(byte* out, G2* y, const Fr* a, const int a_size, const byte x){ +void Fr_polynomial_image_export(byte* out, E2* y, const Fr* a, const int a_size, const byte x){ Fr image; Fr_polynomial_image(&image, y, a, a_size, x); // exports the result @@ -21,7 +21,7 @@ void Fr_polynomial_image_export(byte* out, G2* y, const Fr* a, const int a_size, // computes P(x) = a_0 + a_1 * x + .. + a_n * x^n where P is in Fr[X]. // a_i are all in Fr, `a_size` - 1 is P's degree, x is a small integer less than 255. // The function writes P(x) in `image` and P(x).g2 in `y` if `y` is non NULL -void Fr_polynomial_image(Fr* image, G2* y, const Fr* a, const int a_size, const byte x){ +void Fr_polynomial_image(Fr* image, E2* y, const Fr* a, const int a_size, const byte x){ Fr_set_zero(image); // convert `x` to Montgomery form Fr xR; @@ -40,7 +40,7 @@ void Fr_polynomial_image(Fr* image, G2* y, const Fr* a, const int a_size, const // computes Q(x) = A_0 + A_1*x + ... + A_n*x^n in G2 // and stores the point in y -static void G2_polynomial_image(G2* y, const G2* A, const int len_A, const byte x){ +static void E2_polynomial_image(E2* y, const E2* A, const int len_A, const byte x){ E2_set_infty(y); for (int i = len_A-1; i >= 0 ; i--) { E2_mult_small_expo(y, y, x); // TODO: to bench against a specific version of mult with 8 bits expo @@ -51,17 +51,17 @@ static void G2_polynomial_image(G2* y, const G2* A, const int len_A, const byte // computes y[i] = Q(i+1) for all participants i ( 0 <= i < len_y) // where Q(x) = A_0 + A_1*x + ... 
+ A_n*x^n in G2[X] -void G2_polynomial_images(G2 *y, const int len_y, const G2* A, const int len_A) { +void E2_polynomial_images(E2* y, const int len_y, const E2* A, const int len_A) { for (byte i=0; i Date: Mon, 17 Apr 2023 15:22:54 -0600 Subject: [PATCH 035/200] rename some G1/G2 functions to E1/E2 --- crypto/bls.go | 2 +- crypto/bls12381_utils.c | 8 ++++---- crypto/bls12381_utils.go | 20 +++++++++++--------- crypto/bls12381_utils.h | 6 +++--- crypto/bls_core.c | 14 +++++++------- crypto/bls_crossBLST_test.go | 4 ++-- 6 files changed, 28 insertions(+), 26 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 1375f7f0532..d45ea7f3aeb 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -346,7 +346,7 @@ func (a *blsBLS12381Algo) decodePublicKey(publicKeyBytes []byte) (PublicKey, err pubKeyLengthBLSBLS12381, len(publicKeyBytes)) } var pk pubKeyBLSBLS12381 - err := readPointG2(&pk.point, publicKeyBytes) + err := readPointE2(&pk.point, publicKeyBytes) if err != nil { return nil, fmt.Errorf("decode public key failed: %w", err) } diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index d08880e4d99..9b91e8e0ebd 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1048,7 +1048,7 @@ int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* return read_ret; // check s1 is in G1 - if (check_membership_G1(elemsG1[0]) != VALID) // only enabled if MEMBERSHIP_CHECK==1 + if (G1_check_membership(elemsG1[0]) != VALID) // only enabled if MEMBERSHIP_CHECK==1 return INVALID; // elemsG1[1] = s2 @@ -1058,7 +1058,7 @@ int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* return read_ret; // check s2 in G1 - if (check_membership_G1(elemsG1[1]) != VALID) // only enabled if MEMBERSHIP_CHECK==1 + if (G1_check_membership(elemsG1[1]) != VALID) // only enabled if MEMBERSHIP_CHECK==1 return INVALID; // elemsG2[1] = pk1 @@ -1160,7 +1160,7 @@ int ep_sum_vector_byte(byte* dest, const byte* sigs_bytes, const int len) { // uses a simple scalar multiplication by G1's order // to check whether a point on the curve E1 is in G1. -int simple_subgroup_check_G1(const ep_t p){ +int G1_simple_subgroup_check(const ep_t p){ ep_t inf; ep_new(inf); // check p^order == infinity @@ -1176,7 +1176,7 @@ int simple_subgroup_check_G1(const ep_t p){ // uses a simple scalar multiplication by G1's order // to check whether a point on the curve E2 is in G2. -int simple_subgroup_check_G2(const ep2_t p){ +int G2_simple_subgroup_check(const ep2_t p){ ep2_t inf; ep2_new(inf); // check p^order == infinity diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index c8c08e8ac0e..59776fcec5b 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -203,10 +203,11 @@ func readScalarFrStar(a *scalar, src []byte) error { } -// readPointG2 reads a G2 point from a slice of bytes -// The slice is expected to be of size PubKeyLenBLSBLS12381 and the deserialization will -// follow the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves -func readPointG2(a *pointE2, src []byte) error { +// readPointE2 reads a E2 point from a slice of bytes +// The slice is expected to be of size PubKeyLenBLSBLS12381 and the deserialization +// follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves. +// No G2 membership check is performed. 
+func readPointE2(a *pointE2, src []byte) error { read := C.E2_read_bytes((*C.E2)(a), (*C.uchar)(&src[0]), (C.int)(len(src))) @@ -223,10 +224,11 @@ func readPointG2(a *pointE2, src []byte) error { } } -// readPointG1 reads a G1 point from a slice of bytes -// The slice should be of size SignatureLenBLSBLS12381 and the deserialization will -// follow the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves -func readPointG1(a *pointE1, src []byte) error { +// readPointE1 reads a E1 point from a slice of bytes +// The slice should be of size SignatureLenBLSBLS12381 and the deserialization +// follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves. +// No G1 membership check is performed. +func readPointE1(a *pointE1, src []byte) error { switch C.ep_read_bin_compact((*C.ep_st)(a), (*C.uchar)(&src[0]), (C.int)(len(src))) { @@ -242,7 +244,7 @@ func readPointG1(a *pointE1, src []byte) error { // checkMembershipG1 wraps a call to a subgroup check in G1 since cgo can't be used // in go test files. func checkMembershipG1(pt *pointE1) int { - return int(C.check_membership_G1((*C.ep_st)(pt))) + return int(C.G1_check_membership((*C.ep_st)(pt))) } // checkMembershipG2 wraps a call to a subgroup check in G2 since cgo can't be used diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index b5477187dcd..01f68610603 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -128,8 +128,8 @@ void ep_mult_generic_bench(ep_t, const Fr*); void ep_mult(ep_t, const ep_t, const Fr*); void ep_sum_vector(ep_t, ep_st*, const int); int ep_sum_vector_byte(byte*, const byte*, const int); -int check_membership_G1(const ep_t); -int simple_subgroup_check_G1(const ep_t); +int G1_check_membership(const ep_t); +int G1_simple_subgroup_check(const ep_t); void ep_rand_G1(ep_t); void ep_rand_G1complement( ep_t); #if (MEMBERSHIP_CHECK_G1 == BOWE) @@ -155,7 +155,7 @@ void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo); void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); int G2_check_membership(const E2*); -int simple_subgroup_check_G2(const ep2_t); +int G2_simple_subgroup_check(const ep2_t); void ep2_rand_G2(ep2_t); void ep2_rand_G2complement( ep2_t); diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 6315e711484..eae1382e6a1 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -21,10 +21,10 @@ int get_sk_len() { // Checks if input point p is in the subgroup G1. // The function assumes the input is known to be on the curve E1. 
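// Depending on MEMBERSHIP_CHECK_G1, the check is either the plain exponentiation
// by the group order r (G1_simple_subgroup_check, comparing r.p to infinity) or
// the faster method of https://eprint.iacr.org/2019/814.pdf (section 3.2).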
-int check_membership_G1(const ep_t p){ +int G1_check_membership(const ep_t p){ #if MEMBERSHIP_CHECK #if MEMBERSHIP_CHECK_G1 == EXP_ORDER - return simple_subgroup_check_G1(p); + return G1_simple_subgroup_check(p); #elif MEMBERSHIP_CHECK_G1 == BOWE // section 3.2 from https://eprint.iacr.org/2019/814.pdf return bowe_subgroup_check_G1(p); @@ -47,7 +47,7 @@ int G2_check_membership(const E2* p){ return INVALID; // check p is in G2 #if MEMBERSHIP_CHECK_G2 == EXP_ORDER - return simple_subgroup_check_G2(p); + return G2_simple_subgroup_check(p); #elif MEMBERSHIP_CHECK_G2 == BOWE // TODO: implement Bowe's check return UNDEFINED; @@ -172,7 +172,7 @@ int bls_verifyPerDistinctMessage(const byte* sig, if (ret != RLC_OK) goto out; // check s is in G1 - ret = check_membership_G1(elemsG1[0]); // only enabled if MEMBERSHIP_CHECK==1 + ret = G1_check_membership(elemsG1[0]); // only enabled if MEMBERSHIP_CHECK==1 if (ret != VALID) goto out; // elemsG2[0] = -g2 @@ -260,7 +260,7 @@ int bls_verifyPerDistinctKey(const byte* sig, if (ret != RLC_OK) goto out; // check s in G1 - ret = check_membership_G1(elemsG1[0]); // only enabled if MEMBERSHIP_CHECK==1 + ret = G1_check_membership(elemsG1[0]); // only enabled if MEMBERSHIP_CHECK==1 if (ret != VALID) goto out; // elemsG2[0] = -g2 @@ -346,7 +346,7 @@ int bls_verify(const E2* pk, const byte* sig, const byte* data, const int len) { } // check s is in G1 - if (check_membership_G1(s) != VALID) { // only enabled if MEMBERSHIP_CHECK==1 + if (G1_check_membership(s) != VALID) { // only enabled if MEMBERSHIP_CHECK==1 return INVALID; } @@ -495,7 +495,7 @@ void bls_batchVerify(const int sigs_len, byte* results, const E2* pks_input, // - valid points are multiplied by a random scalar (same for public keys at same index) // to make sure a signature at index (i) is verified against the public key at the same index. 
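    // Scaling signature i and public key i by the same random scalar prevents an
    // adversary from submitting invalid signatures whose errors cancel out across
    // indices in the aggregate; with independent random scalars such a cancellation
    // succeeds only with negligible probability.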
int read_ret = ep_read_bin_compact(&sigs[i], &sigs_bytes[SIGNATURE_LEN*i], SIGNATURE_LEN); - if (read_ret != RLC_OK || check_membership_G1(&sigs[i]) != VALID) { + if (read_ret != RLC_OK || G1_check_membership(&sigs[i]) != VALID) { if (read_ret == UNDEFINED) {// unexpected error case goto out; }; diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index 949e1f6d3b7..e9f9a902d0b 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -152,8 +152,8 @@ func testEncodeDecodeSignatureCrossBLST(t *rapid.T) { // check decoding results are consistent var pointFlow pointE1 - // here we test readPointG1 rather than the simple Signature type alias - err := readPointG1(&pointFlow, sigBytes) + // here we test readPointE1 rather than the simple Signature type alias + err := readPointE1(&pointFlow, sigBytes) flowPass := (err == nil) && (checkMembershipG1(&pointFlow) == int(valid)) var pointBLST blst.P1Affine From f1045056e2f979b7f1f614269fa867c7d8b9db12 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 17 Apr 2023 18:54:46 -0600 Subject: [PATCH 036/200] various renaming in DKG functions --- crypto/bls_thresholdsign.go | 4 ++-- crypto/bls_thresholdsign_core.c | 12 ++++++------ crypto/bls_thresholdsign_include.h | 2 +- crypto/dkg_core.c | 15 ++++++++------- crypto/dkg_feldmanvss.go | 4 ++-- crypto/dkg_feldmanvssq.go | 2 +- crypto/dkg_include.h | 4 ++-- 7 files changed, 22 insertions(+), 21 deletions(-) diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 72fa421def3..5ff2e3a4550 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -413,7 +413,7 @@ func (s *blsThresholdSignatureInspector) reconstructThresholdSignature() (Signat } // Lagrange Interpolate at point 0 - result := C.G1_lagrangeInterpolateAtZero_serialized( + result := C.E1_lagrange_interpolate_at_zero_write( (*C.uchar)(&thresholdSignature[0]), (*C.uchar)(&shares[0]), (*C.uint8_t)(&signers[0]), (C.int)(s.threshold+1)) @@ -501,7 +501,7 @@ func BLSReconstructThresholdSignature(size int, threshold int, thresholdSignature := make([]byte, signatureLengthBLSBLS12381) // Lagrange Interpolate at point 0 - if C.G1_lagrangeInterpolateAtZero_serialized( + if C.E1_lagrange_interpolate_at_zero_write( (*C.uchar)(&thresholdSignature[0]), (*C.uchar)(&flatShares[0]), (*C.uint8_t)(&indexSigners[0]), (C.int)(threshold+1), diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index 75542763f6a..96d07f2a42e 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -9,7 +9,7 @@ // Computes the Lagrange coefficient L_i(0) in Fr with regards to the range [indices(0)..indices(t)] // and stores it in `res`, where t is the degree of the polynomial P. // `len` is equal to `t+1` where `t` is the polynomial degree. -static void Fr_lagrangeCoefficientAtZero(Fr* res, const int i, const uint8_t indices[], const int len){ +static void Fr_lagrange_coeff_at_zero(Fr* res, const int i, const uint8_t indices[], const int len){ // coefficient is computed as N * D^(-1) Fr numerator; // eventually would represent N*R^k @@ -65,7 +65,7 @@ static void Fr_lagrangeCoefficientAtZero(Fr* res, const int i, const uint8_t ind // Computes the Langrange interpolation at zero P(0) = LI(0) with regards to the indices [indices(0)..indices(t)] // and their G1 images [shares(0)..shares(t)], and stores the resulting G1 point in `dest`. // `len` is equal to `t+1` where `t` is the polynomial degree. 
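// With pairwise distinct indices x_k = indices[k], the coefficients are
//   L_i(0) = prod_{j != i} x_j / (x_j - x_i)  (mod r)
// and the interpolated point is dest = sum_i L_i(0).shares[i], computed below with
// roughly one G1 scalar multiplication and one point addition per share.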
-static void G1_lagrangeInterpolateAtZero(ep_st* dest, const ep_st shares[], const uint8_t indices[], const int len) { +static void E1_lagrange_interpolate_at_zero(ep_st* dest, const ep_st shares[], const uint8_t indices[], const int len) { // Purpose is to compute Q(0) where Q(x) = A_0 + A_1*x + ... + A_t*x^t in G1 // where A_i = g1 ^ a_i @@ -79,7 +79,7 @@ static void G1_lagrangeInterpolateAtZero(ep_st* dest, const ep_st shares[], cons Fr fr_lagr_coef; for (int i=0; i < len; i++) { - Fr_lagrangeCoefficientAtZero(&fr_lagr_coef, i, indices, len); + Fr_lagrange_coeff_at_zero(&fr_lagr_coef, i, indices, len); bn_st* bn_lagr_coef = Fr_blst_to_relic(&fr_lagr_coef); ep_mul_lwnaf(mult, &shares[i], bn_lagr_coef); free(bn_lagr_coef); @@ -90,9 +90,9 @@ static void G1_lagrangeInterpolateAtZero(ep_st* dest, const ep_st shares[], cons } // Computes the Langrange interpolation at zero LI(0) with regards to the indices [indices(0)..indices(t)] -// and their G1 concatenated serializations [shares(1)..shares(t+1)], and stores the serialized result in `dest`. +// and writes their E1 concatenated serializations [shares(1)..shares(t+1)] in `dest`. // `len` is equal to `t+1` where `t` is the polynomial degree. -int G1_lagrangeInterpolateAtZero_serialized(byte* dest, const byte* shares, const uint8_t indices[], const int len) { +int E1_lagrange_interpolate_at_zero_write(byte* dest, const byte* shares, const uint8_t indices[], const int len) { int read_ret; // temp variables ep_t res; @@ -108,7 +108,7 @@ int G1_lagrangeInterpolateAtZero_serialized(byte* dest, const byte* shares, cons // G1 interpolation at 0 // computes Q(x) = A_0 + A_1*x + ... + A_t*x^t in G1, // where A_i = g1 ^ a_i - G1_lagrangeInterpolateAtZero(res, ep_shares, indices, len); + E1_lagrange_interpolate_at_zero(res, ep_shares, indices, len); // export the result ep_write_bin_compact(dest, res, SIGNATURE_LEN); diff --git a/crypto/bls_thresholdsign_include.h b/crypto/bls_thresholdsign_include.h index 1bc5809d405..e39e4a06887 100644 --- a/crypto/bls_thresholdsign_include.h +++ b/crypto/bls_thresholdsign_include.h @@ -5,7 +5,7 @@ #include "bls_include.h" -int G1_lagrangeInterpolateAtZero_serialized(byte*, const byte* , const uint8_t[], const int); +int E1_lagrange_interpolate_at_zero_write(byte*, const byte* , const uint8_t[], const int); extern void Fr_polynomial_image(Fr* out, E2* y, const Fr* a, const int a_size, const byte x); #endif diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index aedf5d83164..9b51c89d32b 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -8,10 +8,10 @@ #define T_max ((N_max-1)/2) // computes P(x) = a_0 + a_1*x + .. + a_n x^n (mod r) -// r being the order of G1 -// writes P(x) in out and P(x).g2 in y if y is non NULL -// x being a small integer -void Fr_polynomial_image_export(byte* out, E2* y, const Fr* a, const int a_size, const byte x){ +// r being the order of G1, +// and writes P(x) in out and P(x).g2 in y if y is non NULL +// x being a small integer (byte). +void Fr_polynomial_image_write(byte* out, E2* y, const Fr* a, const int a_size, const byte x){ Fr image; Fr_polynomial_image(&image, y, a, a_size, x); // exports the result @@ -83,9 +83,10 @@ BLST_ERROR E2_vector_read_bytes(E2* A, const byte* src, const int len){ return BLST_SUCCESS; } -// returns 1 if g2^x = y, where g2 is the generator of G2 -// returns 0 otherwise -bool_t verify_share(const Fr* x, const E2* y) { +// checks the discrete log relationship in G2. +// - returns 1 if g2^x = y, where g2 is the generator of G2 +// - returns 0 otherwise. 
+bool_t G2_check_log(const Fr* x, const E2* y) { E2 tmp; G2_mult_gen(&tmp, x); return E2_is_equal(&tmp, y); diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index 51922814b17..fbc4e5eaf68 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -412,7 +412,7 @@ func (s *feldmanVSSstate) receiveVerifVector(origin index, data []byte) { // P(x) is written in dest, while g2^P(x) is written in y // x being a small integer func frPolynomialImage(dest []byte, a []scalar, x index, y *pointE2) { - C.Fr_polynomial_image_export((*C.uchar)(&dest[0]), + C.Fr_polynomial_image_write((*C.uchar)(&dest[0]), (*C.E2)(y), (*C.Fr)(&a[0]), (C.int)(len(a)), (C.uint8_t)(x), @@ -444,7 +444,7 @@ func readVerifVector(A []pointE2, src []byte) error { func (s *feldmanVSSstate) verifyShare() bool { // check y[current] == x.G2 - return C.verify_share( + return C.G2_check_log( (*C.Fr)(&s.x), (*C.E2)(&s.y[s.myIndex])) != 0 } diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index 38b3667ffae..ae929aa49ff 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -506,7 +506,7 @@ func (s *feldmanVSSQualState) buildAndBroadcastComplaintAnswer(complainee index) // - true if the complaint answer is not correct func (s *feldmanVSSQualState) checkComplaint(complainer index, c *complaint) bool { // check y[complainer] == share.G2 - return C.verify_share( + return C.G2_check_log( (*C.Fr)(&c.answer), (*C.E2)(&s.y[complainer])) == 0 } diff --git a/crypto/dkg_include.h b/crypto/dkg_include.h index 8a1248cacd9..c467a43714b 100644 --- a/crypto/dkg_include.h +++ b/crypto/dkg_include.h @@ -5,11 +5,11 @@ #include "bls12381_utils.h" -void Fr_polynomial_image_export(byte* out, E2* y, const Fr* a, const int a_size, const byte x); +void Fr_polynomial_image_write(byte* out, E2* y, const Fr* a, const int a_size, const byte x); void Fr_polynomial_image(Fr* out, E2* y, const Fr* a, const int a_size, const byte x); void E2_polynomial_images(E2* y, const int len_y, const E2* A, const int len_A); void G2_vector_write_bytes(byte* out, const E2* A, const int len); BLST_ERROR E2_vector_read_bytes(E2* A, const byte* src, const int len); -bool_t verify_share(const Fr* x, const E2* y); +bool_t G2_check_log(const Fr* x, const E2* y); #endif From 6f044c26166a2b2a6550687f52e9f3d735702a4a Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 18 Apr 2023 17:49:47 -0600 Subject: [PATCH 037/200] renaming and add Fr_generate_poly function --- crypto/bls.go | 2 +- crypto/bls12381_utils.c | 4 +- crypto/bls12381_utils.go | 10 ++--- crypto/bls12381_utils.h | 2 +- crypto/bls_thresholdsign.go | 4 -- crypto/bls_thresholdsign_include.h | 3 +- crypto/blst_include.h | 1 + crypto/blst_src/blst_src.c | 3 +- crypto/dkg_core.c | 62 ++++++++++++++++++++++++++++-- crypto/dkg_feldmanvss.go | 5 --- crypto/dkg_include.h | 1 + 11 files changed, 74 insertions(+), 23 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index d45ea7f3aeb..34281e0aab5 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -294,7 +294,7 @@ func (a *blsBLS12381Algo) generatePrivateKey(ikm []byte) (PrivateKey, error) { defer overwrite(okm) // overwrite okm // map the bytes to a private key : SK = OS2IP(OKM) mod r - isZero := mapToZr(&sk.scalar, okm) + isZero := mapToFr(&sk.scalar, okm) if !isZero { return sk, nil } diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 9b91e8e0ebd..9518320d051 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -123,6 +123,8 @@ prec_st* init_precomputed_data_BLS12_381() { 
return bls_prec; } +// ------------------- Utilities + // ------------------- Fr utilities // Montgomery constant R related to the curve order r @@ -372,7 +374,7 @@ static void vec256_from_be_bytes(Fr* out, const unsigned char *bytes, size_t n) // Reads a scalar from an array and maps it to Fr using modular reduction. // Input is byte-big-endian as used by the external APIs. // It returns true if scalar is zero and false otherwise. -bool map_bytes_to_Fr(Fr* a, const uint8_t* bin, int len) { +bool_t map_bytes_to_Fr(Fr* a, const uint8_t* bin, int len) { vec256_from_be_bytes(a, bin, len); return Fr_is_zero(a); } diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 59776fcec5b..636ddbc0824 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -129,7 +129,7 @@ func randFr(x *scalar) error { if err != nil { return errors.New("internal rng failed") } - _ = mapToZr(x, bytes) + _ = mapToFr(x, bytes) return nil } @@ -142,19 +142,19 @@ func randFrStar(x *scalar) error { if err != nil { return errors.New("internal rng failed") } - isZero = mapToZr(x, bytes) + isZero = mapToFr(x, bytes) } return nil } -// mapToZr reads a scalar from a slice of bytes and maps it to Zr. +// mapToFr reads a scalar from a slice of bytes and maps it to Zr. // The resulting element `k` therefore satisfies 0 <= k < r. // It returns true if scalar is zero and false otherwise. -func mapToZr(x *scalar, src []byte) bool { +func mapToFr(x *scalar, src []byte) bool { isZero := C.map_bytes_to_Fr((*C.Fr)(x), (*C.uchar)(&src[0]), (C.int)(len(src))) - return bool(isZero) + return isZero != (C.ulonglong)(0) } // writeScalar writes a scalar in a slice of bytes diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 01f68610603..3e4c84ed43f 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -116,7 +116,7 @@ void Fr_inv_exp_montg(Fr *res, const Fr *a); BLST_ERROR Fr_read_bytes(Fr* a, const uint8_t *bin, int len); BLST_ERROR Fr_star_read_bytes(Fr* a, const uint8_t *bin, int len); void Fr_write_bytes(uint8_t *bin, const Fr* a); -bool map_bytes_to_Fr(Fr*, const uint8_t*, int); +bool_t map_bytes_to_Fr(Fr*, const uint8_t*, int); // Fp utilities diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 5ff2e3a4550..008fc1d7ae8 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -558,10 +558,6 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, y := make([]pointE2, size) var X0 pointE2 - // seed relic - if err := seedRelic(seed); err != nil { - return nil, nil, nil, fmt.Errorf("seeding relic failed: %w", err) - } // Generate a polynomial P in Fr[X] of degree t a := make([]scalar, threshold+1) if err := randFrStar(&a[0]); err != nil { // non-identity key diff --git a/crypto/bls_thresholdsign_include.h b/crypto/bls_thresholdsign_include.h index e39e4a06887..a10f482cceb 100644 --- a/crypto/bls_thresholdsign_include.h +++ b/crypto/bls_thresholdsign_include.h @@ -5,7 +5,8 @@ #include "bls_include.h" -int E1_lagrange_interpolate_at_zero_write(byte*, const byte* , const uint8_t[], const int); +int E1_lagrange_interpolate_at_zero_write(byte*, const byte* , const uint8_t[], const int); extern void Fr_polynomial_image(Fr* out, E2* y, const Fr* a, const int a_size, const byte x); +extern void Fr_generate_polynomial(Fr* a); #endif diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 65f552d6fae..64b8e4562b8 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -11,6 +11,7 @@ #include "fields.h" 
#include "consts.h" #include "errors.h" +#include "sha256.h" // types used by the Flow crypto library that are imported from BLST // these type definitions are used as an abstraction from BLST internal types diff --git a/crypto/blst_src/blst_src.c b/crypto/blst_src/blst_src.c index 4b0732e06e4..dc2d2c40a4e 100644 --- a/crypto/blst_src/blst_src.c +++ b/crypto/blst_src/blst_src.c @@ -1,6 +1,7 @@ // +build relic -#include "keygen.c" +// keygen.c is not included as it is imported by dkg_core and is not needed +// by bls12_381_utils #include "hash_to_field.c" #include "e1.c" #include "map_to_g1.c" diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 9b51c89d32b..3a8356bbbf3 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -3,11 +3,65 @@ #include "dkg_include.h" -#define N_max 250 -#define N_bits_max 8 // log(250) -#define T_max ((N_max-1)/2) +// HKDF is used to extract and expand entropy +// `hkdf_ctx` holds the context of a HKDF instance +#include "keygen.c" // imported here in order to import BLST's `HMAC_SHA256_CTX` +typedef struct { + HMAC_SHA256_CTX hmac_ctx; // HMAC context + byte prk[32]; // pseudo-random key used by HKDF +} hkdf_ctx; -// computes P(x) = a_0 + a_1*x + .. + a_n x^n (mod r) +// instanciate a HKDF to extract entropy from `ikm`. +static hkdf_ctx* get_hkdf_ctx(const byte* ikm, const int ikm_len) { + hkdf_ctx* ctx = (hkdf_ctx*) malloc(sizeof(hkdf_ctx)); + HKDF_Extract(ctx->prk, NULL, 0, ikm, ikm_len, 0, &ctx->hmac_ctx); + return ctx; +} + +// expand entropy from a HKDF instance +static void expand_entropy(byte* dest, const int len, hkdf_ctx* ctx) { + HKDF_Expand(dest, len, ctx->prk, NULL, 0, 0, &ctx->hmac_ctx); +} + +// generate a polynomial P = a_0 + a_1*x + .. + a_n x^n in F_r +// where degree `n` is input `degree` (higher degree monomial in non-zero). +// P also guarantees `a_0` is non zero (for single dealer BLS-DKGs, this insures +// protocol public key output is not identity). +// +// `seed` is used as the source of entropy of the secret polynomial. +// `seed_len` is required to be at least 16, and it is not checked in the function. 
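A minimal Go sketch of the same extract-then-expand construction used by Fr_generate_polynomial below, assuming golang.org/x/crypto/hkdf in place of BLST's internal HKDF helpers; the 48-byte coefficient width mirrors Fr_BYTES + Fr_BYTES/2, everything else (names, seed value) is illustrative:

package main

import (
	"crypto/sha256"
	"fmt"
	"io"
	"math/big"

	"golang.org/x/crypto/hkdf"
)

// r is the BLS12-381 scalar field order.
var r, _ = new(big.Int).SetString("73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001", 16)

// generatePolynomial expands a seed into degree+1 coefficients in F_r,
// forcing a_0 and a_degree to be non-zero.
func generatePolynomial(seed []byte, degree int) []*big.Int {
	// extract a PRK from the seed, then expand it into coefficient bytes
	prg := hkdf.New(sha256.New, seed, nil, nil)
	// 48 bytes per coefficient: 16 extra bytes over the 32-byte scalar size keep
	// the bias of the reduction mod r below roughly 2^-128
	buf := make([]byte, 48)
	a := make([]*big.Int, degree+1)
	for k := 0; k <= degree; k++ {
		for {
			if _, err := io.ReadFull(prg, buf); err != nil {
				panic(err) // HKDF output exhausted; cannot happen for small degrees
			}
			c := new(big.Int).Mod(new(big.Int).SetBytes(buf), r)
			if c.Sign() != 0 || (k != 0 && k != degree) {
				a[k] = c
				break
			}
		}
	}
	return a
}

func main() {
	a := generatePolynomial([]byte("a seed of at least 16 bytes....."), 3)
	fmt.Println(len(a), a[0].Sign() != 0, a[3].Sign() != 0) // 4 true true
}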
+void Fr_generate_polynomial(Fr* a, const int degree, const byte* seed, const int seed_len) { + // use HKDF to expand `seed` into the needed bytes + hkdf_ctx* ctx = get_hkdf_ctx(seed, seed_len); + // bytes of each coefficient a_i + // use extra 128 bits to reduce the modular reduction bias (128 is half of Fr_BYTES) + const int coef_bytes_len = Fr_BYTES + Fr_BYTES/2; + byte coef_bytes[coef_bytes_len]; + + // generate a_0 in F_r* + bool_t is_zero = 1; + while (is_zero) { + expand_entropy(coef_bytes, coef_bytes_len, ctx); + is_zero = map_bytes_to_Fr(&a[0], coef_bytes, coef_bytes_len); + } + + if (degree > 1) { + // genarate a_i on F_r, for 0 Date: Tue, 18 Apr 2023 18:00:57 -0600 Subject: [PATCH 038/200] update C polynomial headers to use degree --- crypto/bls_thresholdsign.go | 2 +- crypto/dkg_core.c | 23 +++++++++++------------ crypto/dkg_feldmanvss.go | 4 ++-- crypto/dkg_include.h | 6 +++--- 4 files changed, 17 insertions(+), 18 deletions(-) diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 008fc1d7ae8..8f28d048b63 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -578,7 +578,7 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, C.Fr_polynomial_image( (*C.Fr)(&x[i-1]), (*C.E2)(&y[i-1]), - (*C.Fr)(&a[0]), (C.int)(len(a)), + (*C.Fr)(&a[0]), (C.int)(len(a)-1), (C.uint8_t)(i), ) } diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 3a8356bbbf3..a8c0c976382 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -62,27 +62,26 @@ void Fr_generate_polynomial(Fr* a, const int degree, const byte* seed, const in } // computes P(x) = a_0 + a_1*x + .. + a_n x^n in F_r -// r being the order of G1, -// and writes P(x) in out and P(x).g2 in y if y is non NULL -// x being a small integer (byte). -void Fr_polynomial_image_write(byte* out, E2* y, const Fr* a, const int a_size, const byte x){ +// where `x` is a small integer (byte) and `degree` is P's degree n. +// P(x) is written in `out` and P(x).g2 is written in `y` if `y` is non NULL. +void Fr_polynomial_image_write(byte* out, E2* y, const Fr* a, const int degree, const byte x){ Fr image; - Fr_polynomial_image(&image, y, a, a_size, x); + Fr_polynomial_image(&image, y, a, degree, x); // exports the result Fr_write_bytes(out, &image); } // computes P(x) = a_0 + a_1 * x + .. + a_n * x^n where P is in Fr[X]. -// a_i are all in Fr, `a_size` - 1 is P's degree, x is a small integer less than 255. +// a_i are all in Fr, `degree` is P's degree, x is a small integer less than 255. // The function writes P(x) in `image` and P(x).g2 in `y` if `y` is non NULL -void Fr_polynomial_image(Fr* image, E2* y, const Fr* a, const int a_size, const byte x){ +void Fr_polynomial_image(Fr* image, E2* y, const Fr* a, const int degree, const byte x){ Fr_set_zero(image); // convert `x` to Montgomery form Fr xR; Fr_set_limb(&xR, (limb_t)x); Fr_to_montg(&xR, &xR); - for (int i = a_size-1; i >= 0; i--) { + for (int i = degree; i >= 0; i--) { Fr_mul_montg(image, image, &xR); Fr_add(image, image, &a[i]); // image is in normal form } @@ -94,9 +93,9 @@ void Fr_polynomial_image(Fr* image, E2* y, const Fr* a, const int a_size, const // computes Q(x) = A_0 + A_1*x + ... 
+ A_n*x^n in G2 // and stores the point in y -static void E2_polynomial_image(E2* y, const E2* A, const int len_A, const byte x){ +static void E2_polynomial_image(E2* y, const E2* A, const int degree, const byte x){ E2_set_infty(y); - for (int i = len_A-1; i >= 0 ; i--) { + for (int i = degree; i >= 0 ; i--) { E2_mult_small_expo(y, y, x); // TODO: to bench against a specific version of mult with 8 bits expo E2_add(y, y, &A[i]); } @@ -105,10 +104,10 @@ static void E2_polynomial_image(E2* y, const E2* A, const int len_A, const byte // computes y[i] = Q(i+1) for all participants i ( 0 <= i < len_y) // where Q(x) = A_0 + A_1*x + ... + A_n*x^n in G2[X] -void E2_polynomial_images(E2* y, const int len_y, const E2* A, const int len_A) { +void E2_polynomial_images(E2* y, const int len_y, const E2* A, const int degree) { for (byte i=0; i Date: Wed, 19 Apr 2023 14:08:55 -0600 Subject: [PATCH 039/200] use pseudo-random randFr, randFrStar and FrPolynomial --- crypto/bls12381_utils.go | 33 ++++----- crypto/bls12381_utils_test.go | 10 ++- crypto/bls_thresholdsign.go | 19 ++--- crypto/bls_thresholdsign_include.h | 5 +- crypto/bls_thresholdsign_test.go | 109 +++++++++++++++-------------- crypto/dkg_core.c | 59 ---------------- crypto/dkg_feldmanvss.go | 77 ++++++++++++++------ crypto/dkg_include.h | 1 - 8 files changed, 142 insertions(+), 171 deletions(-) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 636ddbc0824..0756f09472e 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -14,8 +14,9 @@ package crypto // #include "bls12381_utils.h" import "C" import ( - "crypto/rand" "errors" + + "github.com/onflow/flow-go/crypto/random" ) // Go wrappers around BLST C types @@ -122,29 +123,25 @@ func (p *pointE2) isInfinity() bool { return C.E2_is_infty((*C.E2)(p)) != 0 } -// returns a random element of Fr in input pointer -func randFr(x *scalar) error { +// generates a random element in F_r using input random source, +// and saves the random in `x`. +// returns `true` if generated element is zero. +func randFr(x *scalar, rand random.Rand) bool { + // use extra 128 bits to reduce the modular reduction bias bytes := make([]byte, frBytesLen+securityBits/8) - _, err := rand.Read(bytes) // checking one output is enough - if err != nil { - return errors.New("internal rng failed") - } - _ = mapToFr(x, bytes) - return nil + rand.Read(bytes) // checking one output is enough + // modular reduction + return mapToFr(x, bytes) } -// writes a random element of Fr* in input pointer -func randFrStar(x *scalar) error { - bytes := make([]byte, frBytesLen+securityBits/8) +// generates a random element in F_r* using input random source, +// and saves the random in `x`. +func randFrStar(x *scalar, rand random.Rand) { isZero := true + // exteremely unlikely this loop runs more than once for isZero { - _, err := rand.Read(bytes) // checking one output is enough - if err != nil { - return errors.New("internal rng failed") - } - isZero = mapToFr(x, bytes) + isZero = randFr(x, rand) } - return nil } // mapToFr reads a scalar from a slice of bytes and maps it to Zr. 
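The new randFr/randFrStar semantics amount to "reduce 48 uniformly random bytes mod r, retry on zero". A standalone sketch of that behavior, using crypto/rand in place of the random.Rand source (illustrative names, not the package API):

package main

import (
	"crypto/rand"
	"fmt"
	"math/big"
)

// r is the BLS12-381 scalar field order (roughly 2^255).
var r, _ = new(big.Int).SetString("73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001", 16)

// randFr reduces 48 uniformly random bytes mod r and reports whether the
// result is zero. Reducing 384 random bits into a ~255-bit modulus keeps the
// statistical distance from uniform below roughly 2^-128.
func randFr() (*big.Int, bool) {
	buf := make([]byte, 48)
	if _, err := rand.Read(buf); err != nil {
		panic(err)
	}
	x := new(big.Int).Mod(new(big.Int).SetBytes(buf), r)
	return x, x.Sign() == 0
}

// randFrStar retries until the scalar is non-zero; zero occurs with
// probability about 2^-255, so the loop essentially never repeats.
func randFrStar() *big.Int {
	for {
		if x, isZero := randFr(); !isZero {
			return x
		}
	}
}

func main() {
	x := randFrStar()
	fmt.Println(x.Sign() != 0 && x.Cmp(r) < 0) // true
}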
diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index cf0c37d7856..51eaa744284 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -10,16 +10,20 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + + "github.com/onflow/flow-go/crypto/random" ) // G1 and G2 scalar multiplication func BenchmarkScalarMultG1G2(b *testing.B) { - seed := make([]byte, securityBits/8) + seed := make([]byte, random.Chacha20SeedLen) _, _ = rand.Read(seed) - _ = seedRelic(seed) + prg, err := random.NewChacha20PRG(seed, nil) + require.NoError(b, err) + var expo scalar - _ = randFr(&expo) + _ = randFr(&expo, prg) // G1 generator multiplication b.Run("G1 gen", func(b *testing.B) { diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 8f28d048b63..1d19ca42504 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -533,11 +533,13 @@ func EnoughShares(threshold int, sharesNumber int) (bool, error) { // // The function returns : // - (nil, nil, nil, invalidInputsErrorf) if: +// - seed is too short // - n is not in [`ThresholdSignMinSize`, `ThresholdSignMaxSize`] // - threshold value is not in interval [1, n-1] // - (groupPrivKey, []pubKeyShares, groupPubKey, nil) otherwise func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, []PublicKey, PublicKey, error) { + if size < ThresholdSignMinSize || size > ThresholdSignMaxSize { return nil, nil, nil, invalidInputsErrorf( "size should be between %d and %d, got %d", @@ -559,20 +561,11 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, var X0 pointE2 // Generate a polynomial P in Fr[X] of degree t - a := make([]scalar, threshold+1) - if err := randFrStar(&a[0]); err != nil { // non-identity key - return nil, nil, nil, fmt.Errorf("generating the random polynomial failed: %w", err) - } - if threshold > 0 { - for i := 1; i < threshold; i++ { - if err := randFr(&a[i]); err != nil { - return nil, nil, nil, fmt.Errorf("generating the random polynomial failed: %w", err) - } - } - if err := randFrStar(&a[threshold]); err != nil { // enforce the polynomial degree - return nil, nil, nil, fmt.Errorf("generating the random polynomial failed: %w", err) - } + a, err := generateFrPolynomial(seed, threshold) + if err != nil { + return nil, nil, nil, fmt.Errorf("failed to generate random polynomial: %w", err) } + // compute the shares for i := index(1); int(i) <= size; i++ { C.Fr_polynomial_image( diff --git a/crypto/bls_thresholdsign_include.h b/crypto/bls_thresholdsign_include.h index a10f482cceb..ce88c460f95 100644 --- a/crypto/bls_thresholdsign_include.h +++ b/crypto/bls_thresholdsign_include.h @@ -5,8 +5,7 @@ #include "bls_include.h" -int E1_lagrange_interpolate_at_zero_write(byte*, const byte* , const uint8_t[], const int); -extern void Fr_polynomial_image(Fr* out, E2* y, const Fr* a, const int a_size, const byte x); -extern void Fr_generate_polynomial(Fr* a); +int E1_lagrange_interpolate_at_zero_write(byte*, const byte* , const uint8_t[], const int); +extern void Fr_polynomial_image(Fr* out, E2* y, const Fr* a, const int a_size, const byte x); #endif diff --git a/crypto/bls_thresholdsign_test.go b/crypto/bls_thresholdsign_test.go index 6d873da6e68..f04b199732b 100644 --- a/crypto/bls_thresholdsign_test.go +++ b/crypto/bls_thresholdsign_test.go @@ -21,8 +21,8 @@ func TestBLSThresholdSignature(t *testing.T) { t.Run("centralized_stateless_keygen", testCentralizedStatelessAPI) // stateful API 
t.Run("centralized_stateful_keygen", testCentralizedStatefulAPI) - //t.Run("distributed_stateful_feldmanVSS_keygen", testDistributedStatefulAPI_FeldmanVSS) - //t.Run("distributed_stateful_jointFeldman_keygen", testDistributedStatefulAPI_JointFeldman) // Flow Random beacon case + t.Run("distributed_stateful_feldmanVSS_keygen", testDistributedStatefulAPI_FeldmanVSS) + t.Run("distributed_stateful_jointFeldman_keygen", testDistributedStatefulAPI_JointFeldman) // Flow Random beacon case } const thresholdSignatureTag = "random tag" @@ -546,67 +546,68 @@ type statelessKeys struct { // Centralized test of threshold signature protocol using the threshold key generation. func testCentralizedStatelessAPI(t *testing.T) { n := 10 - for threshold := MinimumThreshold; threshold < n; threshold++ { - // generate threshold keys - r := time.Now().UnixNano() - mrand.Seed(r) - t.Log(r) - seed := make([]byte, SeedMinLenDKG) - _, err := mrand.Read(seed) - require.NoError(t, err) - skShares, pkShares, pkGroup, err := BLSThresholdKeyGen(n, threshold, seed) - require.NoError(t, err) - // signature hasher - kmac := NewExpandMsgXOFKMAC128(thresholdSignatureTag) - // generate signature shares - signShares := make([]Signature, 0, n) - signers := make([]int, 0, n) - // fill the signers list and shuffle it - for i := 0; i < n; i++ { - signers = append(signers, i) - } - mrand.Shuffle(n, func(i, j int) { - signers[i], signers[j] = signers[j], signers[i] - }) - // create (t+1) signatures of the first randomly chosen signers - for j := 0; j < threshold+1; j++ { - i := signers[j] - share, err := skShares[i].Sign(thresholdSignatureMessage, kmac) - require.NoError(t, err) - verif, err := pkShares[i].Verify(share, thresholdSignatureMessage, kmac) - require.NoError(t, err) - assert.True(t, verif, "signature share is not valid") - if verif { - signShares = append(signShares, share) - } - } - // reconstruct and test the threshold signature - thresholdSignature, err := BLSReconstructThresholdSignature(n, threshold, signShares, signers[:threshold+1]) + threshold := 6 + //for threshold := MinimumThreshold; threshold < n; threshold++ { + // generate threshold keys + r := time.Now().UnixNano() + mrand.Seed(r) + t.Log(r) + seed := make([]byte, SeedMinLenDKG) + _, err := mrand.Read(seed) + require.NoError(t, err) + skShares, pkShares, pkGroup, err := BLSThresholdKeyGen(n, threshold, seed) + require.NoError(t, err) + // signature hasher + kmac := NewExpandMsgXOFKMAC128(thresholdSignatureTag) + // generate signature shares + signShares := make([]Signature, 0, n) + signers := make([]int, 0, n) + // fill the signers list and shuffle it + for i := 0; i < n; i++ { + signers = append(signers, i) + } + mrand.Shuffle(n, func(i, j int) { + signers[i], signers[j] = signers[j], signers[i] + }) + // create (t+1) signatures of the first randomly chosen signers + for j := 0; j < threshold+1; j++ { + i := signers[j] + share, err := skShares[i].Sign(thresholdSignatureMessage, kmac) require.NoError(t, err) - verif, err := pkGroup.Verify(thresholdSignature, thresholdSignatureMessage, kmac) + verif, err := pkShares[i].Verify(share, thresholdSignatureMessage, kmac) require.NoError(t, err) assert.True(t, verif, "signature share is not valid") - - // check failure with a random redundant signer - if threshold > 1 { - randomDuplicate := mrand.Intn(int(threshold)) + 1 // 1 <= duplicate <= threshold - tmp := signers[randomDuplicate] - signers[randomDuplicate] = signers[0] - thresholdSignature, err = BLSReconstructThresholdSignature(n, threshold, signShares, 
signers[:threshold+1]) - assert.Error(t, err) - assert.True(t, IsDuplicatedSignerError(err)) - assert.Nil(t, thresholdSignature) - signers[randomDuplicate] = tmp + if verif { + signShares = append(signShares, share) } + } + // reconstruct and test the threshold signature + thresholdSignature, err := BLSReconstructThresholdSignature(n, threshold, signShares, signers[:threshold+1]) + require.NoError(t, err) + verif, err := pkGroup.Verify(thresholdSignature, thresholdSignatureMessage, kmac) + require.NoError(t, err) + assert.True(t, verif, "signature share is not valid") - // check with an invalid signature (invalid serialization) - invalidSig := make([]byte, signatureLengthBLSBLS12381) - signShares[0] = invalidSig + // check failure with a random redundant signer + if threshold > 1 { + randomDuplicate := mrand.Intn(int(threshold)) + 1 // 1 <= duplicate <= threshold + tmp := signers[randomDuplicate] + signers[randomDuplicate] = signers[0] thresholdSignature, err = BLSReconstructThresholdSignature(n, threshold, signShares, signers[:threshold+1]) assert.Error(t, err) - assert.True(t, IsInvalidSignatureError(err)) + assert.True(t, IsDuplicatedSignerError(err)) assert.Nil(t, thresholdSignature) + signers[randomDuplicate] = tmp } + + // check with an invalid signature (invalid serialization) + invalidSig := make([]byte, signatureLengthBLSBLS12381) + signShares[0] = invalidSig + thresholdSignature, err = BLSReconstructThresholdSignature(n, threshold, signShares, signers[:threshold+1]) + assert.Error(t, err) + assert.True(t, IsInvalidSignatureError(err)) + assert.Nil(t, thresholdSignature) + //} } func BenchmarkSimpleKeyGen(b *testing.B) { diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index a8c0c976382..48d1f72f752 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -2,65 +2,6 @@ #include "dkg_include.h" - -// HKDF is used to extract and expand entropy -// `hkdf_ctx` holds the context of a HKDF instance -#include "keygen.c" // imported here in order to import BLST's `HMAC_SHA256_CTX` -typedef struct { - HMAC_SHA256_CTX hmac_ctx; // HMAC context - byte prk[32]; // pseudo-random key used by HKDF -} hkdf_ctx; - -// instanciate a HKDF to extract entropy from `ikm`. -static hkdf_ctx* get_hkdf_ctx(const byte* ikm, const int ikm_len) { - hkdf_ctx* ctx = (hkdf_ctx*) malloc(sizeof(hkdf_ctx)); - HKDF_Extract(ctx->prk, NULL, 0, ikm, ikm_len, 0, &ctx->hmac_ctx); - return ctx; -} - -// expand entropy from a HKDF instance -static void expand_entropy(byte* dest, const int len, hkdf_ctx* ctx) { - HKDF_Expand(dest, len, ctx->prk, NULL, 0, 0, &ctx->hmac_ctx); -} - -// generate a polynomial P = a_0 + a_1*x + .. + a_n x^n in F_r -// where degree `n` is input `degree` (higher degree monomial in non-zero). -// P also guarantees `a_0` is non zero (for single dealer BLS-DKGs, this insures -// protocol public key output is not identity). -// -// `seed` is used as the source of entropy of the secret polynomial. -// `seed_len` is required to be at least 16, and it is not checked in the function. 
-void Fr_generate_polynomial(Fr* a, const int degree, const byte* seed, const int seed_len) { - // use HKDF to expand `seed` into the needed bytes - hkdf_ctx* ctx = get_hkdf_ctx(seed, seed_len); - // bytes of each coefficient a_i - // use extra 128 bits to reduce the modular reduction bias (128 is half of Fr_BYTES) - const int coef_bytes_len = Fr_BYTES + Fr_BYTES/2; - byte coef_bytes[coef_bytes_len]; - - // generate a_0 in F_r* - bool_t is_zero = 1; - while (is_zero) { - expand_entropy(coef_bytes, coef_bytes_len, ctx); - is_zero = map_bytes_to_Fr(&a[0], coef_bytes, coef_bytes_len); - } - - if (degree > 1) { - // genarate a_i on F_r, for 0 0 { + // genarate a_i on F_r, for 0 0 { - for i := 1; i < s.threshold; i++ { - if err := randFr(&s.a[i]); err != nil { - return fmt.Errorf("generating the polynomial failed: %w", err) - } - generatorScalarMultG2(&s.vA[i], &s.a[i]) - } - // non-zero a[t] to enforce the polynomial degree - if err := randFrStar(&s.a[s.threshold]); err != nil { - return fmt.Errorf("generating the polynomial failed: %w", err) - } - generatorScalarMultG2(&s.vA[s.threshold], &s.a[s.threshold]) + + // Generate a random polyomial P in Fr[X] of degree t (coefficients are a_i) + // `s.a` are the coefficients of P + // - a_degree is non-zero as deg(P) = degree + // - `a_0` is non-zero to make sure BLS-DKG public key is non-identity + var err error + s.a, err = generateFrPolynomial(seed, s.threshold) + if err != nil { + return fmt.Errorf("failed to generate random polynomial: %w", err) + } + + // compute the verification vector A_i = g2^a_i + s.vA = make([]pointE2, s.threshold+1) + for i := 0; i <= s.threshold; i++ { + generatorScalarMultG2(&s.vA[i], &s.a[i]) } // compute the shares diff --git a/crypto/dkg_include.h b/crypto/dkg_include.h index 6e5c9241638..e8489fbf669 100644 --- a/crypto/dkg_include.h +++ b/crypto/dkg_include.h @@ -5,7 +5,6 @@ #include "bls12381_utils.h" -void Fr_generate_polynomial(Fr* a, const int degree, const byte* seed, const int seed_len); void Fr_polynomial_image_write(byte* out, E2* y, const Fr* a, const int deg, const byte x); void Fr_polynomial_image(Fr* out, E2* y, const Fr* a, const int deg, const byte x); void E2_polynomial_images(E2* y, const int len_y, const E2* A, const int deg); From 3d26f0cf870bb3cb6f7b213e5e0bdc026edfc94d Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 19 Apr 2023 14:24:00 -0600 Subject: [PATCH 040/200] unify seed lengths of DKG and other keyGen seed lengths --- crypto/bls.go | 3 +- crypto/bls12381_utils_test.go | 9 +-- crypto/bls_thresholdsign_test.go | 119 +++++++++++++++---------------- crypto/dkg.go | 3 - crypto/dkg_feldmanvss.go | 6 +- crypto/dkg_test.go | 6 +- 6 files changed, 70 insertions(+), 76 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 34281e0aab5..0cec3458bbf 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -293,7 +293,8 @@ func (a *blsBLS12381Algo) generatePrivateKey(ikm []byte) (PrivateKey, error) { } defer overwrite(okm) // overwrite okm - // map the bytes to a private key : SK = OS2IP(OKM) mod r + // map the bytes to a private key using modular reduction + // SK = OS2IP(OKM) mod r isZero := mapToFr(&sk.scalar, okm) if !isZero { return sk, nil diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 51eaa744284..ed72a5ec84b 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -10,20 +10,17 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - - "github.com/onflow/flow-go/crypto/random" ) // G1 and G2 
scalar multiplication func BenchmarkScalarMultG1G2(b *testing.B) { - seed := make([]byte, random.Chacha20SeedLen) - _, _ = rand.Read(seed) - prg, err := random.NewChacha20PRG(seed, nil) + seed := make([]byte, frBytesLen) + _, err := rand.Read(seed) require.NoError(b, err) var expo scalar - _ = randFr(&expo, prg) + _ = mapToFr(&expo, seed) // G1 generator multiplication b.Run("G1 gen", func(b *testing.B) { diff --git a/crypto/bls_thresholdsign_test.go b/crypto/bls_thresholdsign_test.go index f04b199732b..dfcba3ecccb 100644 --- a/crypto/bls_thresholdsign_test.go +++ b/crypto/bls_thresholdsign_test.go @@ -37,7 +37,7 @@ func testCentralizedStatefulAPI(t *testing.T) { n := 10 for threshold := MinimumThreshold; threshold < n; threshold++ { // generate threshold keys - seed := make([]byte, SeedMinLenDKG) + seed := make([]byte, KeyGenSeedMinLen) _, err := mrand.Read(seed) require.NoError(t, err) skShares, pkShares, pkGroup, err := BLSThresholdKeyGen(n, threshold, seed) @@ -346,9 +346,9 @@ func testDistributedStatefulAPI_FeldmanVSS(t *testing.T) { chans[i] = make(chan *message, 2*n) } // start DKG in all participants - seed := make([]byte, SeedMinLenDKG) + seed := make([]byte, KeyGenSeedMinLen) read, err := rand.Read(seed) - require.Equal(t, read, SeedMinLenDKG) + require.Equal(t, read, KeyGenSeedMinLen) require.NoError(t, err) sync.Add(n) for current := 0; current < n; current++ { @@ -405,9 +405,9 @@ func testDistributedStatefulAPI_JointFeldman(t *testing.T) { chans[i] = make(chan *message, 2*n) } // start DKG in all participants but the - seed := make([]byte, SeedMinLenDKG) + seed := make([]byte, KeyGenSeedMinLen) read, err := rand.Read(seed) - require.Equal(t, read, SeedMinLenDKG) + require.Equal(t, read, KeyGenSeedMinLen) require.NoError(t, err) sync.Add(n) for current := 0; current < n; current++ { @@ -546,73 +546,72 @@ type statelessKeys struct { // Centralized test of threshold signature protocol using the threshold key generation. 
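Condensed, the flow exercised by the test below is roughly the following, assuming the exported package API these patches touch (BLSThresholdKeyGen, NewExpandMsgXOFKMAC128, BLSReconstructThresholdSignature, KeyGenSeedMinLen) and leaving build-tag and module setup aside; a usage sketch, not part of the test suite:

package main

import (
	"crypto/rand"
	"fmt"

	"github.com/onflow/flow-go/crypto"
)

func main() {
	n, threshold := 10, 6
	seed := make([]byte, crypto.KeyGenSeedMinLen)
	if _, err := rand.Read(seed); err != nil {
		panic(err)
	}

	// centralized (trusted dealer) threshold key generation
	skShares, pkShares, pkGroup, err := crypto.BLSThresholdKeyGen(n, threshold, seed)
	if err != nil {
		panic(err)
	}

	// threshold+1 signers each produce a signature share
	hasher := crypto.NewExpandMsgXOFKMAC128("example tag")
	msg := []byte("message")
	shares := make([]crypto.Signature, 0, threshold+1)
	signers := make([]int, 0, threshold+1)
	for i := 0; i <= threshold; i++ {
		share, err := skShares[i].Sign(msg, hasher)
		if err != nil {
			panic(err)
		}
		shares = append(shares, share)
		signers = append(signers, i)
	}

	// reconstruct the group signature and verify it under the group public key
	groupSig, err := crypto.BLSReconstructThresholdSignature(n, threshold, shares, signers)
	if err != nil {
		panic(err)
	}
	ok, err := pkGroup.Verify(groupSig, msg, hasher)
	fmt.Println(ok, err == nil, len(pkShares) == n) // true true true
}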
func testCentralizedStatelessAPI(t *testing.T) { n := 10 - threshold := 6 - //for threshold := MinimumThreshold; threshold < n; threshold++ { - // generate threshold keys - r := time.Now().UnixNano() - mrand.Seed(r) - t.Log(r) - seed := make([]byte, SeedMinLenDKG) - _, err := mrand.Read(seed) - require.NoError(t, err) - skShares, pkShares, pkGroup, err := BLSThresholdKeyGen(n, threshold, seed) - require.NoError(t, err) - // signature hasher - kmac := NewExpandMsgXOFKMAC128(thresholdSignatureTag) - // generate signature shares - signShares := make([]Signature, 0, n) - signers := make([]int, 0, n) - // fill the signers list and shuffle it - for i := 0; i < n; i++ { - signers = append(signers, i) - } - mrand.Shuffle(n, func(i, j int) { - signers[i], signers[j] = signers[j], signers[i] - }) - // create (t+1) signatures of the first randomly chosen signers - for j := 0; j < threshold+1; j++ { - i := signers[j] - share, err := skShares[i].Sign(thresholdSignatureMessage, kmac) + for threshold := MinimumThreshold; threshold < n; threshold++ { + // generate threshold keys + r := time.Now().UnixNano() + mrand.Seed(r) + t.Log(r) + seed := make([]byte, KeyGenSeedMinLen) + _, err := mrand.Read(seed) + require.NoError(t, err) + skShares, pkShares, pkGroup, err := BLSThresholdKeyGen(n, threshold, seed) + require.NoError(t, err) + // signature hasher + kmac := NewExpandMsgXOFKMAC128(thresholdSignatureTag) + // generate signature shares + signShares := make([]Signature, 0, n) + signers := make([]int, 0, n) + // fill the signers list and shuffle it + for i := 0; i < n; i++ { + signers = append(signers, i) + } + mrand.Shuffle(n, func(i, j int) { + signers[i], signers[j] = signers[j], signers[i] + }) + // create (t+1) signatures of the first randomly chosen signers + for j := 0; j < threshold+1; j++ { + i := signers[j] + share, err := skShares[i].Sign(thresholdSignatureMessage, kmac) + require.NoError(t, err) + verif, err := pkShares[i].Verify(share, thresholdSignatureMessage, kmac) + require.NoError(t, err) + assert.True(t, verif, "signature share is not valid") + if verif { + signShares = append(signShares, share) + } + } + // reconstruct and test the threshold signature + thresholdSignature, err := BLSReconstructThresholdSignature(n, threshold, signShares, signers[:threshold+1]) require.NoError(t, err) - verif, err := pkShares[i].Verify(share, thresholdSignatureMessage, kmac) + verif, err := pkGroup.Verify(thresholdSignature, thresholdSignatureMessage, kmac) require.NoError(t, err) assert.True(t, verif, "signature share is not valid") - if verif { - signShares = append(signShares, share) + + // check failure with a random redundant signer + if threshold > 1 { + randomDuplicate := mrand.Intn(int(threshold)) + 1 // 1 <= duplicate <= threshold + tmp := signers[randomDuplicate] + signers[randomDuplicate] = signers[0] + thresholdSignature, err = BLSReconstructThresholdSignature(n, threshold, signShares, signers[:threshold+1]) + assert.Error(t, err) + assert.True(t, IsDuplicatedSignerError(err)) + assert.Nil(t, thresholdSignature) + signers[randomDuplicate] = tmp } - } - // reconstruct and test the threshold signature - thresholdSignature, err := BLSReconstructThresholdSignature(n, threshold, signShares, signers[:threshold+1]) - require.NoError(t, err) - verif, err := pkGroup.Verify(thresholdSignature, thresholdSignatureMessage, kmac) - require.NoError(t, err) - assert.True(t, verif, "signature share is not valid") - // check failure with a random redundant signer - if threshold > 1 { - randomDuplicate := 
mrand.Intn(int(threshold)) + 1 // 1 <= duplicate <= threshold - tmp := signers[randomDuplicate] - signers[randomDuplicate] = signers[0] + // check with an invalid signature (invalid serialization) + invalidSig := make([]byte, signatureLengthBLSBLS12381) + signShares[0] = invalidSig thresholdSignature, err = BLSReconstructThresholdSignature(n, threshold, signShares, signers[:threshold+1]) assert.Error(t, err) - assert.True(t, IsDuplicatedSignerError(err)) + assert.True(t, IsInvalidSignatureError(err)) assert.Nil(t, thresholdSignature) - signers[randomDuplicate] = tmp } - - // check with an invalid signature (invalid serialization) - invalidSig := make([]byte, signatureLengthBLSBLS12381) - signShares[0] = invalidSig - thresholdSignature, err = BLSReconstructThresholdSignature(n, threshold, signShares, signers[:threshold+1]) - assert.Error(t, err) - assert.True(t, IsInvalidSignatureError(err)) - assert.Nil(t, thresholdSignature) - //} } func BenchmarkSimpleKeyGen(b *testing.B) { n := 60 - seed := make([]byte, SeedMinLenDKG) + seed := make([]byte, KeyGenSeedMinLen) _, _ = rand.Read(seed) b.ResetTimer() for i := 0; i < b.N; i++ { @@ -623,7 +622,7 @@ func BenchmarkSimpleKeyGen(b *testing.B) { func BenchmarkSignatureReconstruction(b *testing.B) { n := 60 - seed := make([]byte, SeedMinLenDKG) + seed := make([]byte, KeyGenSeedMinLen) _, _ = rand.Read(seed) threshold := 40 // generate threshold keys diff --git a/crypto/dkg.go b/crypto/dkg.go index 1cdf87a128e..03305d016c7 100644 --- a/crypto/dkg.go +++ b/crypto/dkg.go @@ -34,9 +34,6 @@ const ( DKGMinSize int = MinimumThreshold + 1 // DKGMaxSize is the maximum size of a group participating in a DKG protocol DKGMaxSize int = 254 - // SeedMinLenDKG is the minumum seed length required to participate in a DKG protocol - SeedMinLenDKG = securityBits / 8 - SeedMaxLenDKG = maxRelicPrgSeed ) type DKGState interface { diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index f247b9bc491..64f2a11c383 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -263,12 +263,12 @@ func (s *feldmanVSSstate) ForceDisqualify(participant int) error { // where `n` is the input `degree` (higher degree monomial in non-zero). // `a_0` is also non-zero (for single dealer BLS-DKGs, this insures // protocol public key output is not identity). -// `seed` is used as the entropy source and must be at least `SeedMinLenDKG` +// `seed` is used as the entropy source and must be at least `KeyGenSeedMinLen` // random bytes with at least 128 bits entropy. 
func generateFrPolynomial(seed []byte, degree int) ([]scalar, error) { - if len(seed) < SeedMinLenDKG { + if len(seed) < KeyGenSeedMinLen { return nil, invalidInputsErrorf( - "seed should be at least %d bytes, got %d", SeedMinLenDKG, len(seed)) + "seed should be at least %d bytes, got %d", KeyGenSeedMinLen, len(seed)) } // build a PRG out of the seed diff --git a/crypto/dkg_test.go b/crypto/dkg_test.go index a35d259f4f2..fc8f730e779 100644 --- a/crypto/dkg_test.go +++ b/crypto/dkg_test.go @@ -293,9 +293,9 @@ func dkgCommonTest(t *testing.T, dkg int, n int, threshold int, test testCase) { // start DKG in all participants // start listening on the channels - seed := make([]byte, SeedMinLenDKG) + seed := make([]byte, KeyGenSeedMinLen) read, err := mrand.Read(seed) - require.Equal(t, read, SeedMinLenDKG) + require.Equal(t, read, KeyGenSeedMinLen) require.NoError(t, err) sync.Add(n) @@ -771,7 +771,7 @@ func TestDKGTransitionErrors(t *testing.T) { threshold := 3 myIndex := 0 dealer := 1 - seed := make([]byte, SeedMinLenDKG) + seed := make([]byte, KeyGenSeedMinLen) t.Run("feldman VSS", func(t *testing.T) { state, err := NewFeldmanVSS(n, threshold, myIndex, dummyTestDKGProcessor{}, dealer) From 768602c176312dad7d8a5b441f71d4fa81d0c0a2 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 20 Apr 2023 00:58:14 -0600 Subject: [PATCH 041/200] update randG2 to map to G2 and update membership check in G2 tests --- crypto/bls12381_utils.c | 93 ++++++++++++++++++--------------- crypto/bls12381_utils.go | 34 ++++-------- crypto/bls12381_utils.h | 20 +++---- crypto/bls12381_utils_test.go | 67 +++++++++++++++--------- crypto/bls_thresholdsign_core.c | 4 +- 5 files changed, 112 insertions(+), 106 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 9518320d051..9ab5fb58d91 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -53,16 +53,6 @@ ctx_t* relic_init_BLS12_381() { return core_get(); } -// seeds relic PRG -void seed_relic(byte* seed, int len) { - #if RAND == HASHD - // instantiate a new DRBG - ctx_t *ctx = core_get(); - ctx->seeded = 0; - #endif - rand_seed(seed, len); -} - // global variable of the pre-computed data prec_st bls_prec_st; prec_st* bls_prec = NULL; @@ -128,7 +118,7 @@ prec_st* init_precomputed_data_BLS12_381() { // ------------------- Fr utilities // Montgomery constant R related to the curve order r -const limb_t BLS12_381_rR[Fr_LIMBS] = { /* (1<<256)%r */ +const Fr BLS12_381_rR = { /* R mod r = (1<<256)%r */ TO_LIMB_T(0x1824b159acc5056f), TO_LIMB_T(0x998c4fefecbc4ff5), TO_LIMB_T(0x5884b7fa00034802), TO_LIMB_T(0x00000001fffffffe) }; @@ -346,7 +336,7 @@ void Fr_write_bytes(uint8_t *bin, const Fr* a) { // maps big-endian bytes into an Fr element using modular reduction // Input is byte-big-endian, output is vec256 (also used as Fr) -static void vec256_from_be_bytes(Fr* out, const unsigned char *bytes, size_t n) +static void Fr_from_be_bytes(Fr* out, const unsigned char *bytes, size_t n) { Fr digit, radix; Fr_set_zero(out); @@ -375,14 +365,14 @@ static void vec256_from_be_bytes(Fr* out, const unsigned char *bytes, size_t n) // Input is byte-big-endian as used by the external APIs. // It returns true if scalar is zero and false otherwise. 
bool_t map_bytes_to_Fr(Fr* a, const uint8_t* bin, int len) { - vec256_from_be_bytes(a, bin, len); + Fr_from_be_bytes(a, bin, len); return Fr_is_zero(a); } // ------------------- Fp utilities -// Montgomery constant R related to the prime p -const limb_t BLS12_381_pR[Fp_LIMBS] = { ONE_MONT_P }; /* (1<<384)%p */ +// Montgomery constants related to the prime p +const Fp BLS12_381_pR = { ONE_MONT_P }; /* R mod p = (1<<384)%p */ // sets `a` to 0 void Fp_set_zero(Fp* a){ @@ -1248,14 +1238,21 @@ int bowe_subgroup_check_G1(const ep_t p){ } #endif -// generates a random point in G1 and stores it in p -void ep_rand_G1(ep_t p) { +/* +// maps the bytes to a point in G1 +// this is a testing file only, should not be used in any protocol! +void map_bytes_to_G1(ep_t p, const uint8_t* bytes, int len) { + // map to Fr + Fr log; + map_bytes_to_Fr(&log, bytes, len); // multiplies G1 generator by a random scalar - ep_rand(p); + + } -// generates a random point in E1\G1 and stores it in p -void ep_rand_G1complement(ep_t p) { +// generates a point in E1\G1 and stores it in p +// this is a testing file only, should not be used in any protocol! +void map_bytes_to_G1complement(ep_t p, const uint8_t* bytes, int len) { // generate a random point in E1 p->coord = BASIC; fp_set_dig(p->z, 1); @@ -1273,32 +1270,46 @@ void ep_rand_G1complement(ep_t p) { assert(ep_on_curve(p)); // sanity check to make sure p is in E1 } +*/ -// generates a random point in G2 and stores it in p -void ep2_rand_G2(ep2_t p) { +// maps the bytes to a point in G2. +// `len` should be at least Fr_BYTES. +// this is a testing tool only, it should not be used in any protocol! +void map_bytes_to_G2(E2* p, const uint8_t* bytes, int len) { + assert(len > Fr_BYTES); + // map to Fr + Fr log; + map_bytes_to_Fr(&log, bytes, len); // multiplies G2 generator by a random scalar - ep2_rand(p); -} - -// generates a random point in E2\G2 and stores it in p -void ep2_rand_G2complement(ep2_t p) { - // generate a random point in E2 - p->coord = BASIC; - fp_set_dig(p->z[0], 1); - fp_zero(p->z[1]); - do { - fp2_rand(p->x); // set x to a random field element - byte r; - rand_bytes(&r, 1); - fp2_zero(p->y); - fp_set_bit(p->y[0], 0, r&1); // set y randomly to 0 or 1 + G2_mult_gen(p, &log); +} + +// attempts to map `bytes` to a point in E2\G2 and stores it in p. +// `len` should be at least G2_SER_BYTES. It returns BLST_SUCCESS only if mapping +// succeeds. +// For now, function only works when E2 serialization is compressed. +// this is a testing tool only, it should not be used in any protocol! +BLST_ERROR map_bytes_to_G2complement(E2* p, const uint8_t* bytes, int len) { + assert(G2_SERIALIZATION == COMPRESSED); + assert(len >= G2_SER_BYTES); + + // attempt to deserilize a compressed E2 point from input bytes + // after fixing the header 2 bits + byte copy[G2_SER_BYTES]; + memcpy(copy, bytes, sizeof(copy)); + copy[0] |= 1<<7; // set compression bit + copy[0] &= ~(1<<6); // clear infinity bit - point is not infinity + + BLST_ERROR ser = E2_read_bytes(p, copy, len); + if (ser != BLST_SUCCESS) { + return ser; } - while (ep2_upk(p, p) == 0); // make sure p is in E1 - // map the point to E1\G1 by clearing G1 order - ep2_mul_basic(p, p, &core_get()->ep_r); + // map the point to E2\G2 by clearing G2 order + E2_mult(p, p, (const Fr*)BLS12_381_r); - assert(ep2_on_curve(p)); // sanity check to make sure p is in E1 + assert(E2_affine_on_curve(p)); // sanity check to make sure p is in E2 + return BLST_SUCCESS; } // This is a testing function. 
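The order-clearing trick in map_bytes_to_G2complement has a simple analogue in any group of order r*h: exponentiating by r projects onto the cofactor-h subgroup, hence outside the order-r subgroup unless the result is the identity. A toy Go illustration in Z_23^* with r = 11 and h = 2 (not the library's code):

package main

import (
	"fmt"
	"math/big"
)

func main() {
	// Toy analogue of clearing the G2 order: in Z_23^* (group order 22 = 11 * 2),
	// raising an element to the subgroup order 11 projects it onto the cofactor
	// subgroup {1, 22}, i.e. outside the order-11 subgroup unless it lands on 1.
	p, q := big.NewInt(23), big.NewInt(11)
	x := big.NewInt(5)                 // arbitrary group element
	y := new(big.Int).Exp(x, q, p)     // "multiply by the order": y = x^11 mod 23
	check := new(big.Int).Exp(y, q, p) // y is in the order-11 subgroup iff y^11 == 1
	fmt.Println(y, check.Cmp(big.NewInt(1)) == 0) // 22 false
}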
diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 0756f09472e..b6f822a6b1a 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -62,23 +62,6 @@ func (ct *ctx) initContext() error { return nil } -// seeds the internal relic random function. -// relic context must be initialized before seeding. -func seedRelic(seed []byte) error { - if len(seed) < (securityBits / 8) { - return invalidInputsErrorf( - "seed length needs to be larger than %d", - securityBits/8) - } - if len(seed) > maxRelicPrgSeed { - return invalidInputsErrorf( - "seed length needs to be less than %x", - maxRelicPrgSeed) - } - C.seed_relic((*C.uchar)(&seed[0]), (C.int)(len(seed))) - return nil -} - // Exponentiation in G1 (scalar point multiplication) func (p *pointE1) scalarMultG1(res *pointE1, expo *scalar) { C.ep_mult((*C.ep_st)(res), (*C.ep_st)(p), (*C.Fr)(expo)) @@ -250,6 +233,7 @@ func checkMembershipG2(pt *pointE2) int { return int(C.G2_check_membership((*C.E2)(pt))) } +/* // randPointG1 wraps a call to C since cgo can't be used in go test files. // It generates a random point in G1 and stores it in input point. func randPointG1(pt *pointE1) { @@ -261,20 +245,20 @@ func randPointG1(pt *pointE1) { func randPointG1Complement(pt *pointE1) { C.ep_rand_G1complement((*C.ep_st)(pt)) } +*/ -/* -// randPointG2 wraps a call to C since cgo can't be used in go test files. +// mapToG2 wraps a call to C since cgo can't be used in go test files. // It generates a random point in G2 and stores it in input point. -func randPointG2(pt *pointE2) { - C.ep2_rand_G2((*C.E2)(pt)) +func mapToG2(pt *pointE2, src []byte) { + C.map_bytes_to_G2((*C.E2)(pt), (*C.uchar)(&src[0]), (C.int)(len(src))) } -// randPointG1Complement wraps a call to C since cgo can't be used in go test files. +// mapToG2Complement wraps a call to C since cgo can't be used in go test files. // It generates a random point in E2\G2 and stores it in input point. -func randPointG2Complement(pt *pointE2) { - C.ep2_rand_G2complement((*C.E2)(pt)) +func mapToG2Complement(pt *pointE2, src []byte) bool { + res := C.map_bytes_to_G2complement((*C.E2)(pt), (*C.uchar)(&src[0]), (C.int)(len(src))) + return int(res) == blst_valid } -*/ // This is only a TEST function. // It hashes `data` to a G1 point using the tag `dst` and returns the G1 point serialization. 
diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 3e4c84ed43f..831bce5c62f 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -96,7 +96,7 @@ int bls_spock_verify(const E2*, const byte*, const E2*, const byte*); void map_to_G1(ep_t, const byte*, const int); // Fr utilities -extern const limb_t BLS12_381_rR[Fr_LIMBS]; +extern const Fr BLS12_381_rR; bool_t Fr_is_zero(const Fr* a); bool_t Fr_is_equal(const Fr* a, const Fr* b); void Fr_set_limb(Fr*, const limb_t); @@ -130,8 +130,8 @@ void ep_sum_vector(ep_t, ep_st*, const int); int ep_sum_vector_byte(byte*, const byte*, const int); int G1_check_membership(const ep_t); int G1_simple_subgroup_check(const ep_t); -void ep_rand_G1(ep_t); -void ep_rand_G1complement( ep_t); +void map_bytes_to_G1(E1*, const uint8_t*, int); +void map_bytes_to_G1complement(E1*, const uint8_t*, int); #if (MEMBERSHIP_CHECK_G1 == BOWE) int bowe_subgroup_check_G1(const ep_t); #endif @@ -150,20 +150,16 @@ void E2_mult(E2*, const E2*, const Fr*); void E2_mult_small_expo(E2*, const E2*, const byte); void E2_add(E2* res, const E2* a, const E2* b); void E2_sum_vector(E2*, const E2*, const int); - -void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo); - -void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); -int G2_check_membership(const E2*); -int G2_simple_subgroup_check(const ep2_t); -void ep2_rand_G2(ep2_t); -void ep2_rand_G2complement( ep2_t); +void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); +int G2_check_membership(const E2*); +int G2_simple_subgroup_check(const ep2_t); +void map_bytes_to_G2(E2*, const uint8_t*, int); +BLST_ERROR map_bytes_to_G2complement(E2*, const uint8_t*, int); // Utility functions ctx_t* relic_init_BLS12_381(); prec_st* init_precomputed_data_BLS12_381(); void precomputed_data_set(const prec_st* p); -void seed_relic(byte*, int); // utility testing function void xmd_sha256(uint8_t *, int, uint8_t *, int, uint8_t *, int); diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index ed72a5ec84b..7389dfa1454 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -6,7 +6,9 @@ package crypto import ( "crypto/rand" "encoding/hex" + mrand "math/rand" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -102,10 +104,9 @@ func BenchmarkMapToG1(b *testing.B) { // test subgroup membership check in G1 and G2 func TestSubgroupCheck(t *testing.T) { - // seed Relic PRG - seed := make([]byte, securityBits/8) - _, _ = rand.Read(seed) - _ = seedRelic(seed) + r := time.Now().UnixNano() + mrand.Seed(r) + t.Logf("math rand seed is %d", r) /*t.Run("G1", func(t *testing.T) { var p pointE1 @@ -115,24 +116,34 @@ func TestSubgroupCheck(t *testing.T) { randPointG1Complement(&p) // point in E1\G1 res = checkMembershipG1(&p) assert.Equal(t, res, int(invalid)) + })*/ + + t.Run("G2", func(t *testing.T) { + t.Skip() // TODO: fix membership check in G2 and update + var p pointE2 + seed := make([]byte, PubKeyLenBLSBLS12381) + _, err := mrand.Read(seed) + require.NoError(t, err) + mapToG2(&p, seed) // point in G2 + res := checkMembershipG2(&p) + assert.Equal(t, res, int(valid)) + + inG2 := false + for !inG2 { + _, err := mrand.Read(seed) + require.NoError(t, err) + inG2 = mapToG2Complement(&p, seed) // point in E2\G2 + } + res = checkMembershipG2(&p) + assert.Equal(t, res, int(invalid)) }) - t.Run("G2", func(t *testing.T) { - var p pointE2 - randPointG2(&p) // point in G2 - res := checkMembershipG2(&p) - 
assert.Equal(t, res, int(valid)) - randPointG2Complement(&p) // point in E2\G2 - res = checkMembershipG2(&p) - assert.Equal(t, res, int(invalid)) - }) - */ } // subgroup membership check bench func BenchmarkSubgroupCheck(b *testing.B) { - b.Run("G1", func(b *testing.B) { + /*b.Run("G1", func(b *testing.B) { var p pointE1 randPointG1(&p) b.ResetTimer() @@ -140,16 +151,20 @@ func BenchmarkSubgroupCheck(b *testing.B) { _ = checkMembershipG1(&p) // G1 } b.StopTimer() + })*/ + + b.Run("G2", func(b *testing.B) { + var p pointE2 + seed := make([]byte, PubKeyLenBLSBLS12381) + _, err := mrand.Read(seed) + require.NoError(b, err) + mapToG2(&p, seed) // point in G2 + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = checkMembershipG2(&p) // G2 + } + b.StopTimer() }) - /* - b.Run("G2", func(b *testing.B) { - var p pointE2 - randPointG2(&p) - b.ResetTimer() - for i := 0; i < b.N; i++ { - _ = checkMembershipG2(&p) // G2 - } - b.StopTimer() - }) - */ + } diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index 96d07f2a42e..777af1ef5e9 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -16,8 +16,8 @@ static void Fr_lagrange_coeff_at_zero(Fr* res, const int i, const uint8_t indice Fr denominator; // eventually would represent D*R^k // Initialize N and D to Montgomery constant R - Fr_copy(&numerator, (Fr*)BLS12_381_rR); - Fr_copy(&denominator, (Fr*)BLS12_381_rR); + Fr_copy(&numerator, &BLS12_381_rR); + Fr_copy(&denominator, &BLS12_381_rR); // sign of D: 0 for positive and 1 for negative int sign = 0; From 21676845b5f728c4e2ebc9d1bb052d4ecbf7b8f2 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 20 Apr 2023 11:48:22 -0600 Subject: [PATCH 042/200] membership check in G2 using BLST --- crypto/bls.go | 4 ++-- crypto/bls12381_utils.c | 27 +++++++++---------------- crypto/bls12381_utils.go | 9 +++++---- crypto/bls12381_utils.h | 5 ++--- crypto/bls12381_utils_test.go | 8 ++------ crypto/bls_core.c | 38 ++++++++--------------------------- crypto/bls_include.h | 3 --- crypto/blst_src/blst_src.c | 3 +-- 8 files changed, 29 insertions(+), 68 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 0cec3458bbf..5cc78190d8a 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -293,7 +293,7 @@ func (a *blsBLS12381Algo) generatePrivateKey(ikm []byte) (PrivateKey, error) { } defer overwrite(okm) // overwrite okm - // map the bytes to a private key using modular reduction + // map the bytes to a private key using modular reduction // SK = OS2IP(OKM) mod r isZero := mapToFr(&sk.scalar, okm) if !isZero { @@ -353,7 +353,7 @@ func (a *blsBLS12381Algo) decodePublicKey(publicKeyBytes []byte) (PublicKey, err } // membership check in G2 - if C.G2_check_membership((*C.E2)(&pk.point)) != valid { + if C.E2_in_G2((*C.E2)(&pk.point)) == (C.ulonglong)(0) { return nil, invalidInputsErrorf("input key is infinity or does not encode a BLS12-381 point in the valid group") } diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 9ab5fb58d91..855d61ad7ac 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1013,6 +1013,13 @@ void G2_mult_gen(E2* res, const Fr* expo) { POINTonE2_sign((POINTonE2*)res, &BLS12_381_G2, tmp); } +// checks if input E2 point is on the subgroup G2. +// It assumes input `p` is on E2. 
+bool_t E2_in_G2(const E2* p){ + // currently uses Scott method + return POINTonE2_in_G2((const POINTonE2*)p); +} + // computes the sum of the G2 array elements y and writes the sum in jointy void E2_sum_vector(E2* jointy, const E2* y, const int len){ E2_set_infty(jointy); @@ -1040,7 +1047,7 @@ int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* return read_ret; // check s1 is in G1 - if (G1_check_membership(elemsG1[0]) != VALID) // only enabled if MEMBERSHIP_CHECK==1 + if (E1_in_G1(elemsG1[0]) != VALID) return INVALID; // elemsG1[1] = s2 @@ -1050,7 +1057,7 @@ int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* return read_ret; // check s2 in G1 - if (G1_check_membership(elemsG1[1]) != VALID) // only enabled if MEMBERSHIP_CHECK==1 + if (E1_in_G1(elemsG1[1]) != VALID) return INVALID; // elemsG2[1] = pk1 @@ -1166,22 +1173,6 @@ int G1_simple_subgroup_check(const ep_t p){ return VALID; } -// uses a simple scalar multiplication by G1's order -// to check whether a point on the curve E2 is in G2. -int G2_simple_subgroup_check(const ep2_t p){ - ep2_t inf; - ep2_new(inf); - // check p^order == infinity - // use basic double & add as lwnaf reduces the expo modulo r - ep2_mul_basic(inf, (ep2_st*)p, &core_get()->ep_r); - if (!ep2_is_infty(inf)){ - ep2_free(inf); - return INVALID; - } - ep2_free(inf); - return VALID; -} - #if (MEMBERSHIP_CHECK_G1 == BOWE) // beta such that beta^3 == 1 mod p // beta is in the Montgomery form diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index b6f822a6b1a..033bcfcb20f 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -223,14 +223,15 @@ func readPointE1(a *pointE1, src []byte) error { // checkMembershipG1 wraps a call to a subgroup check in G1 since cgo can't be used // in go test files. -func checkMembershipG1(pt *pointE1) int { - return int(C.G1_check_membership((*C.ep_st)(pt))) +func checkMembershipG1(pt *pointE1) bool { + //return C.E1_in_G1((*C.E1)(pt)) != (C.ulonglong)(0) + return true } // checkMembershipG2 wraps a call to a subgroup check in G2 since cgo can't be used // in go test files. 
-func checkMembershipG2(pt *pointE2) int { - return int(C.G2_check_membership((*C.E2)(pt))) +func checkMembershipG2(pt *pointE2) bool { + return C.E2_in_G2((*C.E2)(pt)) != (C.ulonglong)(0) } /* diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 831bce5c62f..d29dcf54c63 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -128,7 +128,7 @@ void ep_mult_generic_bench(ep_t, const Fr*); void ep_mult(ep_t, const ep_t, const Fr*); void ep_sum_vector(ep_t, ep_st*, const int); int ep_sum_vector_byte(byte*, const byte*, const int); -int G1_check_membership(const ep_t); +int E1_in_G1(const ep_t); int G1_simple_subgroup_check(const ep_t); void map_bytes_to_G1(E1*, const uint8_t*, int); void map_bytes_to_G1complement(E1*, const uint8_t*, int); @@ -151,8 +151,7 @@ void E2_mult_small_expo(E2*, const E2*, const byte); void E2_add(E2* res, const E2* a, const E2* b); void E2_sum_vector(E2*, const E2*, const int); void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); -int G2_check_membership(const E2*); -int G2_simple_subgroup_check(const ep2_t); +bool_t E2_in_G2(const E2*); void map_bytes_to_G2(E2*, const uint8_t*, int); BLST_ERROR map_bytes_to_G2complement(E2*, const uint8_t*, int); diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 7389dfa1454..78d08810a6f 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -119,14 +119,12 @@ func TestSubgroupCheck(t *testing.T) { })*/ t.Run("G2", func(t *testing.T) { - t.Skip() // TODO: fix membership check in G2 and update var p pointE2 seed := make([]byte, PubKeyLenBLSBLS12381) _, err := mrand.Read(seed) require.NoError(t, err) mapToG2(&p, seed) // point in G2 - res := checkMembershipG2(&p) - assert.Equal(t, res, int(valid)) + assert.True(t, checkMembershipG2(&p)) inG2 := false for !inG2 { @@ -134,10 +132,8 @@ func TestSubgroupCheck(t *testing.T) { require.NoError(t, err) inG2 = mapToG2Complement(&p, seed) // point in E2\G2 } - res = checkMembershipG2(&p) - assert.Equal(t, res, int(invalid)) + assert.False(t, checkMembershipG2(&p)) }) - } // subgroup membership check bench diff --git a/crypto/bls_core.c b/crypto/bls_core.c index eae1382e6a1..47e7d270546 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -21,8 +21,9 @@ int get_sk_len() { // Checks if input point p is in the subgroup G1. // The function assumes the input is known to be on the curve E1. -int G1_check_membership(const ep_t p){ -#if MEMBERSHIP_CHECK +int E1_in_G1(const ep_t p){ +// TODO: to upadte +/* #if MEMBERSHIP_CHECK_G1 == EXP_ORDER return G1_simple_subgroup_check(p); #elif MEMBERSHIP_CHECK_G1 == BOWE @@ -31,30 +32,7 @@ int G1_check_membership(const ep_t p){ #else return UNDEFINED; #endif -#endif - return VALID; -} - -// checks if input point s is on the curve E2 -// and is in the subgroup G2. -// -// membership check in G2 is using a scalar multiplication by the group order. -// TODO: switch to the faster Bowe check -int G2_check_membership(const E2* p){ -#if MEMBERSHIP_CHECK - // check p is on curve - if (!E2_affine_on_curve(p)) // TODO: remove and assume inputs are on curve? 
- return INVALID; - // check p is in G2 - #if MEMBERSHIP_CHECK_G2 == EXP_ORDER - return G2_simple_subgroup_check(p); - #elif MEMBERSHIP_CHECK_G2 == BOWE - // TODO: implement Bowe's check - return UNDEFINED; - #else - return UNDEFINED; - #endif -#endif +*/ return VALID; } @@ -172,7 +150,7 @@ int bls_verifyPerDistinctMessage(const byte* sig, if (ret != RLC_OK) goto out; // check s is in G1 - ret = G1_check_membership(elemsG1[0]); // only enabled if MEMBERSHIP_CHECK==1 + ret = E1_in_G1(elemsG1[0]); if (ret != VALID) goto out; // elemsG2[0] = -g2 @@ -260,7 +238,7 @@ int bls_verifyPerDistinctKey(const byte* sig, if (ret != RLC_OK) goto out; // check s in G1 - ret = G1_check_membership(elemsG1[0]); // only enabled if MEMBERSHIP_CHECK==1 + ret = E1_in_G1(elemsG1[0]); if (ret != VALID) goto out; // elemsG2[0] = -g2 @@ -346,7 +324,7 @@ int bls_verify(const E2* pk, const byte* sig, const byte* data, const int len) { } // check s is in G1 - if (G1_check_membership(s) != VALID) { // only enabled if MEMBERSHIP_CHECK==1 + if (E1_in_G1(s) != VALID) { return INVALID; } @@ -495,7 +473,7 @@ void bls_batchVerify(const int sigs_len, byte* results, const E2* pks_input, // - valid points are multiplied by a random scalar (same for public keys at same index) // to make sure a signature at index (i) is verified against the public key at the same index. int read_ret = ep_read_bin_compact(&sigs[i], &sigs_bytes[SIGNATURE_LEN*i], SIGNATURE_LEN); - if (read_ret != RLC_OK || G1_check_membership(&sigs[i]) != VALID) { + if (read_ret != RLC_OK || E1_in_G1(&sigs[i]) != VALID) { if (read_ret == UNDEFINED) {// unexpected error case goto out; }; diff --git a/crypto/bls_include.h b/crypto/bls_include.h index f81f2839bcf..f5a6a53a6f7 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -20,9 +20,6 @@ #define DOUBLE_PAIRING 1 #define SINGLE_PAIRING (DOUBLE_PAIRING^1) -// Signature and public key membership check -#define MEMBERSHIP_CHECK 0 // TODO: switch to 1 and clean up memb check - // algorithm choice for hashing to G1 // both methods are similar implementations of the same optimized SSWU // but offer different timings. 
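Both the removed G2_simple_subgroup_check and the EXP_ORDER branch of E1_in_G1 above rely on the classic membership test: multiply the candidate point by the prime group order and compare the result to the point at infinity. A minimal standalone sketch of that idea, transposed to the multiplicative group Z_p^* with made-up toy constants (not the curve code, and not this package's API), where raising to the subgroup order plays the role of the scalar multiplication:

package main

import (
	"fmt"
	"math/big"
)

// inPrimeOrderSubgroup reports whether x lies in the subgroup of prime order r
// of Z_p^*: x is a member exactly when x^r == 1, the analogue of
// "p * order == infinity" on the curve.
func inPrimeOrderSubgroup(x, r, p *big.Int) bool {
	one := big.NewInt(1)
	return new(big.Int).Exp(x, r, p).Cmp(one) == 0
}

func main() {
	p := big.NewInt(23) // p-1 = 22 = 2*11, so Z_23^* has a subgroup of prime order 11
	r := big.NewInt(11)

	fmt.Println(inPrimeOrderSubgroup(big.NewInt(2), r, p)) // true:  2 has order 11
	fmt.Println(inPrimeOrderSubgroup(big.NewInt(5), r, p)) // false: 5^11 == -1 mod 23
}

The exponentiation-by-order test is correct but costs a full scalar multiplication, which is why the patches above move G1 and G2 membership to the faster checks implemented by blst (the Scott method mentioned in E2_in_G2).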
diff --git a/crypto/blst_src/blst_src.c b/crypto/blst_src/blst_src.c index dc2d2c40a4e..4b0732e06e4 100644 --- a/crypto/blst_src/blst_src.c +++ b/crypto/blst_src/blst_src.c @@ -1,7 +1,6 @@ // +build relic -// keygen.c is not included as it is imported by dkg_core and is not needed -// by bls12_381_utils +#include "keygen.c" #include "hash_to_field.c" #include "e1.c" #include "map_to_g1.c" From 2d90a7089757864dd3ce340ea6e488620b23821d Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 20 Apr 2023 18:08:32 -0600 Subject: [PATCH 043/200] update batch verify random coefficients --- crypto/bls12381_utils.c | 2 -- crypto/bls_core.c | 40 ++++++++++++++++++++++++---------------- crypto/bls_include.h | 4 ++-- crypto/bls_multisig.go | 41 ++++++++++++++++++++++++----------------- crypto/bls_test.go | 7 +++++-- 5 files changed, 55 insertions(+), 39 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 855d61ad7ac..55569075f14 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -113,8 +113,6 @@ prec_st* init_precomputed_data_BLS12_381() { return bls_prec; } -// ------------------- Utilities - // ------------------- Fr utilities // Montgomery constant R related to the curve order r diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 47e7d270546..528aaef7244 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -357,7 +357,7 @@ static void free_tree(node* root) { if (!root) return; // only free pks and sigs of non-leafs, data of leafs are allocated - // as an entire array in `bls_batchVerify`. + // as an entire array in `bls_batch_verify`. if (root->left) { // no need to check the right child for the leaf check because // the recursive build starts with the left side first // relic free @@ -413,7 +413,7 @@ static node* build_tree(const int len, const E2* pks, const ep_st* sigs) { } // verify the binary tree and fill the results using recursive batch verifications. -static void bls_batchVerify_tree(const node* root, const int len, byte* results, +static void bls_batch_verify_tree(const node* root, const int len, byte* results, const byte* data, const int data_len) { // verify the aggregated signature against the aggregated public key. int res = bls_verify_ep(root->pk, root->sig, data, data_len); @@ -436,21 +436,23 @@ static void bls_batchVerify_tree(const node* root, const int len, byte* results, // use the binary tree structure to find the invalid signatures. int right_len = len/2; int left_len = len - right_len; - bls_batchVerify_tree(root->left, left_len, &results[0], data, data_len); - bls_batchVerify_tree(root->right, right_len, &results[left_len], data, data_len); + bls_batch_verify_tree(root->left, left_len, &results[0], data, data_len); + bls_batch_verify_tree(root->right, right_len, &results[left_len], data, data_len); } // Batch verifies the validity of a multiple BLS signatures of the // same message under multiple public keys. Each signature at index `i` is verified // against the public key at index `i`. +// `seed` is used as the entropy source for randoms required by the computation. The function +// assumes the source size is at least (16*sigs_len) of random bytes of entropy at least 128 bits. // // - membership checks of all signatures is verified upfront. // - use random coefficients for signatures and public keys at the same index to prevent // indices mixup. // - optimize the verification by verifying an aggregated signature against an aggregated // public key, and use a recursive verification to find invalid signatures. 
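The batch verification being renamed to bls_batch_verify in the hunks below combines two ideas spelled out in its comment: a random coefficient applied to the signature and the public key at the same index prevents index mixups, and a recursive split over an aggregation tree localizes invalid signatures with only a logarithmic number of aggregated pairing checks when most signatures are valid. A minimal sketch of that recursion, with the pairing check abstracted into a stand-in predicate (all names here are illustrative, not the package API; real code aggregates curve points, but the control flow is the same):

package main

import "fmt"

// verifyAggregate stands in for "verify the aggregated signature against the
// aggregated public key"; here it simply reports whether every signature in
// the range is good.
func verifyAggregate(good []bool) bool {
	for _, ok := range good {
		if !ok {
			return false
		}
	}
	return true
}

// batchVerifyTree fills results[i] with the validity of signature i, using a
// single aggregated check per subtree whose signatures are all valid and
// recursing into both halves otherwise.
func batchVerifyTree(good []bool, results []bool) {
	if verifyAggregate(good) {
		for i := range results {
			results[i] = true
		}
		return
	}
	if len(good) == 1 { // leaf: this single signature is invalid
		results[0] = false
		return
	}
	left := len(good) - len(good)/2 // mirror the left_len / right_len split
	batchVerifyTree(good[:left], results[:left])
	batchVerifyTree(good[left:], results[left:])
}

func main() {
	good := []bool{true, true, false, true, true, true, true, true}
	results := make([]bool, len(good))
	batchVerifyTree(good, results)
	fmt.Println(results) // [true true false true true true true true]
}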
-void bls_batchVerify(const int sigs_len, byte* results, const E2* pks_input, - const byte* sigs_bytes, const byte* data, const int data_len) { +void bls_batch_verify(const int sigs_len, byte* results, const E2* pks_input, + const byte* sigs_bytes, const byte* data, const int data_len, const byte* seed) { // initialize results to undefined memset(results, UNDEFINED, sigs_len); @@ -464,7 +466,6 @@ void bls_batchVerify(const int sigs_len, byte* results, const E2* pks_input, ep_new(sigs[i]); ep2_new(pks[i]); } - bn_t r; bn_new(r); for (int i=0; i < sigs_len; i++) { // convert the signature points: @@ -484,14 +485,21 @@ void bls_batchVerify(const int sigs_len, byte* results, const E2* pks_input, results[i] = INVALID; } else { // choose a random non-zero coefficient of at least 128 bits - // TODO: find a way to generate randoms - bn_rand(r, RLC_POS, SEC_BITS); - bn_add_dig(r, r, 1); - Fr* tmp = Fr_relic_to_blst(r); - // multiply public key and signature by the same random exponent - E2_mult(&pks[i], &pks_input[i], tmp); - free(tmp); - ep_mul_lwnaf(&sigs[i], &sigs[i], r); + Fr r, one; + // r = random, i-th seed is used for i-th signature + Fr_set_zero(&r); + const int seed_len = SEC_BITS/8; + limbs_from_be_bytes((limb_t*)&r, seed + (seed_len*i), seed_len); // faster shortcut than Fr_map_bytes + // r = random + 1 + Fr_set_limb(&one, 1); + Fr_add(&r, &r, &one); + /*char str[20]; sprintf(str, "r-%d", i); + Fr_print_(str, &r);*/ + // multiply public key and signature by the same random exponent r + E2_mult(&pks[i], &pks_input[i], &r); // TODO: faster version for short expos? + bn_st* tmp = Fr_blst_to_relic(&r); + ep_mul_lwnaf(&sigs[i], &sigs[i], tmp); + free(tmp); } } // build a binary tree of aggreagtions @@ -499,7 +507,7 @@ void bls_batchVerify(const int sigs_len, byte* results, const E2* pks_input, if (!root) goto out; // verify the binary tree and fill the results using batch verification - bls_batchVerify_tree(root, sigs_len, &results[0], data, data_len); + bls_batch_verify_tree(root, sigs_len, &results[0], data, data_len); // free the allocated tree free_tree(root); diff --git a/crypto/bls_include.h b/crypto/bls_include.h index f5a6a53a6f7..d0f9120beb2 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -39,7 +39,7 @@ int bls_verifyPerDistinctMessage(const byte*, const int, const byte*, const int bls_verifyPerDistinctKey(const byte*, const int, const E2*, const uint32_t*, const byte*, const uint32_t*); -void bls_batchVerify(const int, byte*, const E2*, - const byte*, const byte*, const int); +void bls_batch_verify(const int, byte*, const E2*, + const byte*, const byte*, const int, const byte*); #endif diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index e6589a60031..ffaf8d637ce 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -4,6 +4,7 @@ package crypto import ( + "crypto/rand" "errors" "fmt" @@ -472,27 +473,27 @@ func VerifyBLSSignatureManyMessages( func BatchVerifyBLSSignaturesOneMessage( pks []PublicKey, sigs []Signature, message []byte, kmac hash.Hasher, ) ([]bool, error) { + // boolean array returned when errors occur + falseSlice := make([]bool, len(sigs)) + // empty list check if len(pks) == 0 { - return []bool{}, fmt.Errorf("invalid list of public keys: %w", blsAggregateEmptyListError) + return falseSlice, fmt.Errorf("invalid list of public keys: %w", blsAggregateEmptyListError) } if len(pks) != len(sigs) { - return []bool{}, invalidInputsErrorf( + return falseSlice, invalidInputsErrorf( "keys length %d and signatures length %d are 
mismatching", len(pks), len(sigs)) } - // return boolean array returnBool := make([]bool, len(sigs)) - // temporary boolean array to hold the return values till all the return values are set - tmpBool := make([]bool, len(sigs)) - for i := range tmpBool { - tmpBool[i] = true // default to true + for i := range returnBool { + returnBool[i] = true // default to true } if err := checkBLSHasher(kmac); err != nil { - return returnBool, err + return falseSlice, err } // flatten the shares (required by the C layer) @@ -507,14 +508,14 @@ func BatchVerifyBLSSignaturesOneMessage( for i, pk := range pks { pkBLS, ok := pk.(*pubKeyBLSBLS12381) if !ok { - return returnBool, fmt.Errorf("key at index %d is invalid: %w", i, notBLSKeyError) + return falseSlice, fmt.Errorf("key at index %d is invalid: %w", i, notBLSKeyError) } if len(sigs[i]) != signatureLengthBLSBLS12381 || pkBLS.isIdentity { // case of invalid signature: set the signature and public key at index `i` // to identities so that there is no effect on the aggregation tree computation. // However, the boolean return for index `i` is set to `false` and won't be overwritten. - tmpBool[i] = false + returnBool[i] = false pkPoints = append(pkPoints, getIdentityPoint()) flatSigs = append(flatSigs, identityBLSSignature...) } else { @@ -525,28 +526,34 @@ func BatchVerifyBLSSignaturesOneMessage( // hash the input to 128 bytes h := kmac.ComputeHash(message) - verifInt := make([]byte, len(returnBool)) + verifInt := make([]byte, len(sigs)) + // internal non-determministic entropy source required by bls_batch_verify + // specific length of the seed is required by bls_batch_verify. + seed := make([]byte, (securityBits/8)*len(verifInt)) + _, err := rand.Read(seed) + if err != nil { + return falseSlice, fmt.Errorf("generating randoms failed: %w", err) + } - C.bls_batchVerify( + C.bls_batch_verify( (C.int)(len(verifInt)), (*C.uchar)(&verifInt[0]), (*C.E2)(&pkPoints[0]), (*C.uchar)(&flatSigs[0]), (*C.uchar)(&h[0]), (C.int)(len(h)), + (*C.uchar)(&seed[0]), ) for i, v := range verifInt { if (C.int)(v) != valid && (C.int)(v) != invalid { - return returnBool, fmt.Errorf("batch verification failed") + return falseSlice, fmt.Errorf("batch verification failed") } - if tmpBool[i] { // only overwrite if not previously written - tmpBool[i] = ((C.int)(v) == valid) + if returnBool[i] { // only overwrite if not previously set to false + returnBool[i] = ((C.int)(v) == valid) } } - // make sure returnBool is []false till this point - copy(returnBool, tmpBool) return returnBool, nil } diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 703ec9784b8..d6c849f2feb 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -725,16 +725,19 @@ func TestBLSBatchVerify(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks[:0], sigs[:0], input, kmac) require.Error(t, err) assert.True(t, IsBLSAggregateEmptyListError(err)) - assert.Equal(t, valid, []bool{}, + assert.Equal(t, valid, expectedValid[:0], "verification should fail with empty list key, got %v", valid) }) // test incorrect inputs t.Run("inconsistent inputs", func(t *testing.T) { + for i := 0; i < sigsNum; i++ { + expectedValid[i] = false + } valid, err := BatchVerifyBLSSignaturesOneMessage(pks[:len(pks)-1], sigs, input, kmac) require.Error(t, err) assert.True(t, IsInvalidInputsError(err)) - assert.Equal(t, valid, []bool{}, + assert.Equal(t, valid, expectedValid, "verification should fail with incorrect input lenghts, got %v", valid) }) From 183627b770db6e5ff8afc8fe2fd56cab9800de35 Mon Sep 17 00:00:00 2001 
From: Tarak Ben Youssef Date: Thu, 20 Apr 2023 18:45:02 -0600 Subject: [PATCH 044/200] minor cleanup --- crypto/bls12381_utils.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 033bcfcb20f..0ca3e8d48a2 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -112,7 +112,7 @@ func (p *pointE2) isInfinity() bool { func randFr(x *scalar, rand random.Rand) bool { // use extra 128 bits to reduce the modular reduction bias bytes := make([]byte, frBytesLen+securityBits/8) - rand.Read(bytes) // checking one output is enough + rand.Read(bytes) // modular reduction return mapToFr(x, bytes) } From c033600bad1eba99bd261e8218a4306426ce275d Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 20 Apr 2023 21:07:31 -0600 Subject: [PATCH 045/200] fix linter --- crypto/bls12381_utils.go | 3 ++- crypto/common.go | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 0ca3e8d48a2..7c46b8a20a3 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -121,7 +121,8 @@ func randFr(x *scalar, rand random.Rand) bool { // and saves the random in `x`. func randFrStar(x *scalar, rand random.Rand) { isZero := true - // exteremely unlikely this loop runs more than once + // exteremely unlikely this loop runs more than once, + // but force the output to be non-zero instead of propagating an error. for isZero { isZero = randFr(x, rand) } diff --git a/crypto/common.go b/crypto/common.go index f476de92e3f..7e460cbf6d2 100644 --- a/crypto/common.go +++ b/crypto/common.go @@ -21,9 +21,6 @@ const ( // it is still recommened that seed is generated using a secure RNG. KeyGenSeedMinLen = 2 * (securityBits / 8) KeyGenSeedMaxLen = 256 - - // max relic PRG seed length in bytes - maxRelicPrgSeed = 1 << 32 ) // TODO: update this code to make sure From 0a2e943e1a7ae10f66627ee7529238347cd38d53 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 24 Apr 2023 17:38:27 -0600 Subject: [PATCH 046/200] add replaces to test using latest crypto package --- go.mod | 2 ++ insecure/go.mod | 2 ++ integration/go.mod | 2 ++ 3 files changed, 6 insertions(+) diff --git a/go.mod b/go.mod index 21a9faa6018..d808194e99f 100644 --- a/go.mod +++ b/go.mod @@ -278,3 +278,5 @@ require ( lukechampine.com/blake3 v1.1.7 // indirect nhooyr.io/websocket v1.8.6 // indirect ) + +replace github.com/onflow/flow-go/crypto => ./crypto diff --git a/insecure/go.mod b/insecure/go.mod index 1c74525425e..a76a0fe92db 100644 --- a/insecure/go.mod +++ b/insecure/go.mod @@ -269,3 +269,5 @@ require ( ) replace github.com/onflow/flow-go => ../ + +replace github.com/onflow/flow-go/crypto => ../crypto diff --git a/integration/go.mod b/integration/go.mod index b1ae92ab43b..0261ce32dd4 100644 --- a/integration/go.mod +++ b/integration/go.mod @@ -325,3 +325,5 @@ require ( replace github.com/onflow/flow-go => ../ replace github.com/onflow/flow-go/insecure => ../insecure + +replace github.com/onflow/flow-go/crypto => ../crypto From 7bd182aacfdec08219649482e7bfba2e028845e7 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 24 Apr 2023 18:08:18 -0600 Subject: [PATCH 047/200] temp update to makefile to setup crypto with replace statement --- Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b465aad4e31..8719bc21dce 100644 --- a/Makefile +++ b/Makefile @@ -43,9 +43,12 @@ export CONTAINER_REGISTRY := gcr.io/flow-container-registry export 
DOCKER_BUILDKIT := 1 # setup the crypto package under the GOPATH: needed to test packages importing flow-go/crypto +# TODO: replace by bash crypto_setup.sh after removing replace statements .PHONY: crypto_setup_gopath crypto_setup_gopath: - bash crypto_setup.sh + (cd ./crypto && make setup) + + cmd/collection/collection: go build -o cmd/collection/collection cmd/collection/main.go From 9ae8df2473230b6c389bc1645bb3e668771fcf3e Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 24 Apr 2023 18:19:52 -0600 Subject: [PATCH 048/200] mod tidy --- go.sum | 3 --- insecure/go.mod | 1 + insecure/go.sum | 19 ++++++++++++++++--- integration/go.mod | 1 + integration/go.sum | 4 +--- 5 files changed, 19 insertions(+), 9 deletions(-) diff --git a/go.sum b/go.sum index 79d22d8b924..b5ddfc7ecfd 100644 --- a/go.sum +++ b/go.sum @@ -1236,8 +1236,6 @@ github.com/onflow/flow-ft/lib/go/contracts v0.7.0 h1:XEKE6qJUw3luhsYmIOteXP53gtx github.com/onflow/flow-ft/lib/go/contracts v0.7.0/go.mod h1:kTMFIySzEJJeupk+7EmXs0EJ6CBWY/MV9fv9iYQk+RU= github.com/onflow/flow-go-sdk v0.40.0 h1:s8uwoyTquN8tjdXpqGmNkXTjf79yUII8JExc5QEl4Xw= github.com/onflow/flow-go-sdk v0.40.0/go.mod h1:34dxXk9Hp/bQw6Zy6+H44Xo0kQU+aJyQoqdDxq00rJM= -github.com/onflow/flow-go/crypto v0.24.7 h1:RCLuB83At4z5wkAyUCF7MYEnPoIIOHghJaODuJyEoW0= -github.com/onflow/flow-go/crypto v0.24.7/go.mod h1:fqCzkIBBMRRkciVrvW21rECKq1oD7Q6u+bCI78lfNX0= github.com/onflow/flow/protobuf/go/flow v0.3.2-0.20230330183547-d0dd18f6f20d h1:Wl8bE1YeZEcRNnCpxw2rikOEaivuYKDrnJd2vsfIWoA= github.com/onflow/flow/protobuf/go/flow v0.3.2-0.20230330183547-d0dd18f6f20d/go.mod h1:NA2pX2nw8zuaxfKphhKsk00kWLwfd+tv8mS23YXO4Sk= github.com/onflow/go-bitswap v0.0.0-20221017184039-808c5791a8a8 h1:XcSR/n2aSVO7lOEsKScYALcpHlfowLwicZ9yVbL6bnA= @@ -1477,7 +1475,6 @@ github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/subosito/gotenv v1.4.0 h1:yAzM1+SmVcz5R4tXGsNMu1jUl2aOJXoiWUCEwwnGrvs= github.com/subosito/gotenv v1.4.0/go.mod h1:mZd6rFysKEcUhUHXJk0C/08wAgyDBFuwEYL7vWWGaGo= -github.com/supranational/blst v0.3.10 h1:CMciDZ/h4pXDDXQASe8ZGTNKUiVNxVVA5hpci2Uuhuk= github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ= github.com/syndtr/goleveldb v1.0.1-0.20190923125748-758128399b1d/go.mod h1:9OrXJhf154huy1nPWmuSrkgjPUtUNhA+Zmy+6AESzuA= github.com/tarm/serial v0.0.0-20180830185346-98f6abe2eb07/go.mod h1:kDXzergiv9cbyO7IOYJZWg1U88JhDg3PB6klq9Hg2pA= diff --git a/insecure/go.mod b/insecure/go.mod index a76a0fe92db..dae2503f3b6 100644 --- a/insecure/go.mod +++ b/insecure/go.mod @@ -257,6 +257,7 @@ require ( golang.org/x/time v0.0.0-20210723032227-1f47c861a9ac // indirect golang.org/x/tools v0.6.0 // indirect golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect + gonum.org/v1/gonum v0.8.2 // indirect google.golang.org/api v0.114.0 // indirect google.golang.org/appengine v1.6.7 // indirect google.golang.org/genproto v0.0.0-20230306155012-7f2fa6fef1f4 // indirect diff --git a/insecure/go.sum b/insecure/go.sum index 598f99e4cdb..d4214a1cbdd 100644 --- a/insecure/go.sum +++ b/insecure/go.sum @@ -85,6 +85,7 @@ github.com/VictoriaMetrics/fastcache v1.5.3/go.mod h1:+jv9Ckb+za/P1ZRg/sulP5Ni1v github.com/VividCortex/gohistogram v1.0.0/go.mod h1:Pf5mBqqDxYaXu3hDrrU+w6nw50o/4+TcAqDqk/vUH7g= github.com/aead/siphash v1.0.1/go.mod h1:Nywa3cDsYNNK3gaciGTWPwHt0wlpNV15vwmswBAUSII= github.com/afex/hystrix-go 
v0.0.0-20180502004556-fa1af6a1f4f5/go.mod h1:SkGFH1ia65gfNATL8TAiHDNxPzPdmEL5uirI2Uyuz6c= +github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= @@ -304,6 +305,7 @@ github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568/go.mod h1:xEzjJPgXI github.com/flynn/noise v0.0.0-20180327030543-2492fe189ae6/go.mod h1:1i71OnUq3iUe1ma7Lr6yG6/rjvM3emb6yoL7xLFzcVQ= github.com/flynn/noise v1.0.0 h1:DlTHqmzmvcEiKj+4RYo/imoswx/4r6iBlCMfVtrMXpQ= github.com/flynn/noise v1.0.0/go.mod h1:xbMo+0i6+IGbYdJhF31t2eR1BIU0CYc12+BNAKwUTag= +github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= github.com/francoispqt/gojay v1.2.13 h1:d2m3sFjloqoIUQU3TsHBgj6qg/BVGlTBeHDUmyJnXKk= github.com/francoispqt/gojay v1.2.13/go.mod h1:ehT5mTG4ua4581f1++1WLG0vPdaA9HaiDsoyrBGkyDY= github.com/franela/goblin v0.0.0-20200105215937-c9ffbefa60db/go.mod h1:7dvUGVsVBjqR7JHJk0brhHOZYGmfBYOrK0ZhYMEtBr4= @@ -391,6 +393,7 @@ github.com/gogo/protobuf v1.3.0/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXP github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/glog v1.0.0 h1:nfP3RFugxnNRyKgeWd4oI1nYvXpxrx8ck8ZrcizshdQ= github.com/golang/glog v1.0.0/go.mod h1:EWib/APOK0SL3dFbYqvxE3UYd8E6s1ouQ7iEp/0LWV4= @@ -723,6 +726,7 @@ github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfV github.com/julienschmidt/httprouter v1.1.1-0.20170430222011-975b5c4c7c21/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= github.com/julienschmidt/httprouter v1.3.0/go.mod h1:JR6WtHb+2LUe8TCKY3cZOxFyyO8IZAc4RVcycCCAKdM= +github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= github.com/k0kubun/go-ansi v0.0.0-20180517002512-3bf9e2903213/go.mod h1:vNUNkEQ1e29fT/6vq2aBdFsgNPmy8qMdSay1npru+Sw= github.com/kami-zh/go-capturer v0.0.0-20171211120116-e492ea43421d/go.mod h1:P2viExyCEfeWGU259JnaQ34Inuec4R38JCyBx2edgD0= github.com/karalabe/usb v0.0.0-20190919080040-51dc0efba356/go.mod h1:Od972xHfMJowv7NGVDiWVxk2zxnWgjLlJzE+F4F7AGU= @@ -1184,8 +1188,6 @@ github.com/onflow/flow-ft/lib/go/contracts v0.7.0 h1:XEKE6qJUw3luhsYmIOteXP53gtx github.com/onflow/flow-ft/lib/go/contracts v0.7.0/go.mod h1:kTMFIySzEJJeupk+7EmXs0EJ6CBWY/MV9fv9iYQk+RU= github.com/onflow/flow-go-sdk v0.40.0 h1:s8uwoyTquN8tjdXpqGmNkXTjf79yUII8JExc5QEl4Xw= github.com/onflow/flow-go-sdk v0.40.0/go.mod h1:34dxXk9Hp/bQw6Zy6+H44Xo0kQU+aJyQoqdDxq00rJM= -github.com/onflow/flow-go/crypto v0.24.7 h1:RCLuB83At4z5wkAyUCF7MYEnPoIIOHghJaODuJyEoW0= -github.com/onflow/flow-go/crypto v0.24.7/go.mod 
h1:fqCzkIBBMRRkciVrvW21rECKq1oD7Q6u+bCI78lfNX0= github.com/onflow/flow/protobuf/go/flow v0.3.2-0.20230330183547-d0dd18f6f20d h1:Wl8bE1YeZEcRNnCpxw2rikOEaivuYKDrnJd2vsfIWoA= github.com/onflow/flow/protobuf/go/flow v0.3.2-0.20230330183547-d0dd18f6f20d/go.mod h1:NA2pX2nw8zuaxfKphhKsk00kWLwfd+tv8mS23YXO4Sk= github.com/onflow/go-bitswap v0.0.0-20221017184039-808c5791a8a8 h1:XcSR/n2aSVO7lOEsKScYALcpHlfowLwicZ9yVbL6bnA= @@ -1423,7 +1425,6 @@ github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/subosito/gotenv v1.4.0 h1:yAzM1+SmVcz5R4tXGsNMu1jUl2aOJXoiWUCEwwnGrvs= github.com/subosito/gotenv v1.4.0/go.mod h1:mZd6rFysKEcUhUHXJk0C/08wAgyDBFuwEYL7vWWGaGo= -github.com/supranational/blst v0.3.10 h1:CMciDZ/h4pXDDXQASe8ZGTNKUiVNxVVA5hpci2Uuhuk= github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ= github.com/syndtr/goleveldb v1.0.1-0.20190923125748-758128399b1d/go.mod h1:9OrXJhf154huy1nPWmuSrkgjPUtUNhA+Zmy+6AESzuA= github.com/tarm/serial v0.0.0-20180830185346-98f6abe2eb07/go.mod h1:kDXzergiv9cbyO7IOYJZWg1U88JhDg3PB6klq9Hg2pA= @@ -1590,7 +1591,10 @@ golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5y golang.org/x/crypto v0.0.0-20211108221036-ceb1ce70b4fa/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.4.0 h1:UVQgzMY87xqpKNgb+kDsll2Igd33HszWHFLmpaRMq/8= golang.org/x/crypto v0.4.0/go.mod h1:3quD/ATkf6oY+rnes5c3ExXTbLc8mueNue5/DoinL80= +golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= golang.org/x/exp v0.0.0-20190829153037-c13cbed26979/go.mod h1:86+5VVa7VpoJ4kLfm080zCjGlMRFzhUhsZKEZO7MGek= @@ -1603,6 +1607,7 @@ golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMk golang.org/x/exp v0.0.0-20200331195152-e8c3332aa8e5/go.mod h1:4M0jN8W1tt0AVLNr8HDosyJCDCDuyL9N9+3m7wDWgKw= golang.org/x/exp v0.0.0-20221217163422-3c43f8badb15 h1:5oN1Pz/eDhCpbMbLstvIPa0b/BEQo6g6nwV3pLjfM6w= golang.org/x/exp v0.0.0-20221217163422-3c43f8badb15/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc= +golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/lint v0.0.0-20180702182130-06c8688daad7/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= @@ -1833,12 +1838,14 @@ golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxb golang.org/x/time v0.0.0-20210723032227-1f47c861a9ac h1:7zkz7BUtwNFFqcowJ+RIgu2MaV/MapERkDIy+mwPyjs= golang.org/x/time v0.0.0-20210723032227-1f47c861a9ac/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod 
h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180828015842-6cd1fcedba52/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20181030000716-a0a13e073c7b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20181030221726-6c7e314b6563/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20181130052023-1c3d964395ce/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= @@ -1904,7 +1911,12 @@ golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8T golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 h1:H2TDz8ibqkAF6YGhCdN3jS9O0/s90v0rJh3X/OLHEUk= golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2/go.mod h1:K8+ghG5WaK9qNqU5K3HdILfMLy1f3aNYFI/wnl100a8= +gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= gonum.org/v1/gonum v0.8.2 h1:CCXrcPKiGGotvnN6jfUsKk4rRqm7q09/YbKb5xCEvtM= +gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0= +gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0 h1:OE9mWmgKkjJyEmDAAtGMPjXu+YNeGvK9VTSHY6+Qihc= +gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= +gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= google.golang.org/api v0.0.0-20180910000450-7ca32eb868bf/go.mod h1:4mhQ8q/RsB7i+udVvVy5NUi08OU8ZlA0gRVgrF7VFY0= google.golang.org/api v0.0.0-20181030000543-1d582fd0359e/go.mod h1:4mhQ8q/RsB7i+udVvVy5NUi08OU8ZlA0gRVgrF7VFY0= google.golang.org/api v0.1.0/go.mod h1:UGEZY7KEX120AnNLIHFMKIo4obdJhkp2tPbaPlQx13Y= @@ -2090,6 +2102,7 @@ nhooyr.io/websocket v1.8.6 h1:s+C3xAMLwGmlI31Nyn/eAehUlZPwfYZu2JXM621Q5/k= nhooyr.io/websocket v1.8.6/go.mod h1:B70DZP8IakI65RVQ51MsWP/8jndNma26DVA/nFSCgW0= pgregory.net/rapid v0.4.7 h1:MTNRktPuv5FNqOO151TM9mDTa+XHcX6ypYeISDVD14g= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= +rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= sigs.k8s.io/yaml v1.1.0/go.mod h1:UJmg0vDUVViEyp3mgSv9WPwZCDxu4rQW1olrI1uml+o= diff --git a/integration/go.mod b/integration/go.mod index 0261ce32dd4..6487fe8f906 100644 --- a/integration/go.mod +++ b/integration/go.mod @@ -307,6 +307,7 @@ require ( golang.org/x/time v0.0.0-20210723032227-1f47c861a9ac // indirect golang.org/x/tools v0.6.0 // indirect golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // 
indirect + gonum.org/v1/gonum v0.11.0 // indirect google.golang.org/api v0.114.0 // indirect google.golang.org/appengine v1.6.7 // indirect google.golang.org/genproto v0.0.0-20230306155012-7f2fa6fef1f4 // indirect diff --git a/integration/go.sum b/integration/go.sum index 35c6fbd3bef..4870a501c95 100644 --- a/integration/go.sum +++ b/integration/go.sum @@ -1316,8 +1316,6 @@ github.com/onflow/flow-ft/lib/go/contracts v0.7.0 h1:XEKE6qJUw3luhsYmIOteXP53gtx github.com/onflow/flow-ft/lib/go/contracts v0.7.0/go.mod h1:kTMFIySzEJJeupk+7EmXs0EJ6CBWY/MV9fv9iYQk+RU= github.com/onflow/flow-go-sdk v0.40.0 h1:s8uwoyTquN8tjdXpqGmNkXTjf79yUII8JExc5QEl4Xw= github.com/onflow/flow-go-sdk v0.40.0/go.mod h1:34dxXk9Hp/bQw6Zy6+H44Xo0kQU+aJyQoqdDxq00rJM= -github.com/onflow/flow-go/crypto v0.24.7 h1:RCLuB83At4z5wkAyUCF7MYEnPoIIOHghJaODuJyEoW0= -github.com/onflow/flow-go/crypto v0.24.7/go.mod h1:fqCzkIBBMRRkciVrvW21rECKq1oD7Q6u+bCI78lfNX0= github.com/onflow/flow/protobuf/go/flow v0.3.2-0.20230407005012-727d541fd5f8 h1:O8uM6GVVMhRwBtYaGl93+tDSu6vWqUc47b12fPkZGXk= github.com/onflow/flow/protobuf/go/flow v0.3.2-0.20230407005012-727d541fd5f8/go.mod h1:NA2pX2nw8zuaxfKphhKsk00kWLwfd+tv8mS23YXO4Sk= github.com/onflow/go-bitswap v0.0.0-20221017184039-808c5791a8a8 h1:XcSR/n2aSVO7lOEsKScYALcpHlfowLwicZ9yVbL6bnA= @@ -1601,7 +1599,6 @@ github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o github.com/subosito/gotenv v1.2.0/go.mod h1:N0PQaV/YGNqwC0u51sEeR/aUtSLEXKX9iv69rRypqCw= github.com/subosito/gotenv v1.4.0 h1:yAzM1+SmVcz5R4tXGsNMu1jUl2aOJXoiWUCEwwnGrvs= github.com/subosito/gotenv v1.4.0/go.mod h1:mZd6rFysKEcUhUHXJk0C/08wAgyDBFuwEYL7vWWGaGo= -github.com/supranational/blst v0.3.10 h1:CMciDZ/h4pXDDXQASe8ZGTNKUiVNxVVA5hpci2Uuhuk= github.com/syndtr/gocapability v0.0.0-20170704070218-db04d3cc01c8/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww= github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ= github.com/syndtr/goleveldb v1.0.1-0.20200815110645-5c35d600f0ca/go.mod h1:u2MKkTVTVJWe5D1rCvame8WqhBd88EuIwODJZ1VHCPM= @@ -2141,6 +2138,7 @@ gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJ gonum.org/v1/gonum v0.0.0-20181121035319-3f7ecaa7e8ca/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= gonum.org/v1/gonum v0.6.0/go.mod h1:9mxDZsDKxgMAuccQkewq682L+0eCu4dCN2yonUJTCLU= gonum.org/v1/gonum v0.11.0 h1:f1IJhK4Km5tBJmaiJXtk/PkL4cdVX6J+tGiM187uT5E= +gonum.org/v1/gonum v0.11.0/go.mod h1:fSG4YDCxxUZQJ7rKsQrj0gMOg00Il0Z96/qMA4bVQhA= gonum.org/v1/netlib v0.0.0-20181029234149-ec6d1f5cefe6/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= From 2fb816c18eaecc6c684ec5016fa7237a1ae042a6 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 25 Apr 2023 15:07:11 -0600 Subject: [PATCH 049/200] enable membership check in G2 to fix FVM test --- crypto/bls_core.c | 4 +++- crypto/bls_include.h | 2 +- crypto/dkg_core.c | 2 +- fvm/crypto/crypto_test.go | 15 +++++++-------- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 097e1595d44..e29d3401d69 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -47,7 +47,9 @@ int G2_check_membership(const E2* p){ return INVALID; // check p is in G2 #if MEMBERSHIP_CHECK_G2 == EXP_ORDER - return G2_simple_subgroup_check(p); + // 
TODO: clean up + ep2_st* tmp = E2_blst_to_relic(p); + return G2_simple_subgroup_check(tmp); #elif MEMBERSHIP_CHECK_G2 == BOWE // TODO: implement Bowe's check return UNDEFINED; diff --git a/crypto/bls_include.h b/crypto/bls_include.h index f81f2839bcf..7a2572a2fc4 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -21,7 +21,7 @@ #define SINGLE_PAIRING (DOUBLE_PAIRING^1) // Signature and public key membership check -#define MEMBERSHIP_CHECK 0 // TODO: switch to 1 and clean up memb check +#define MEMBERSHIP_CHECK 1 // algorithm choice for hashing to G1 // both methods are similar implementations of the same optimized SSWU diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index aedf5d83164..d5f39976090 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -79,7 +79,7 @@ BLST_ERROR E2_vector_read_bytes(E2* A, const byte* src, const int len){ return read_ret; p += G2_SER_BYTES; } - // TODO: add G2 subgroup check + // TODO: add G2 subgroup check? return BLST_SUCCESS; } diff --git a/fvm/crypto/crypto_test.go b/fvm/crypto/crypto_test.go index fe6c400c1b4..ffbdec3a730 100644 --- a/fvm/crypto/crypto_test.go +++ b/fvm/crypto/crypto_test.go @@ -425,16 +425,13 @@ func TestVerifySignatureFromTransaction(t *testing.T) { func TestValidatePublicKey(t *testing.T) { - // make sure the seed length is larger than miniumum seed lengths of all signature schemes - seedLength := 64 - validPublicKey := func(t *testing.T, s runtime.SignatureAlgorithm) []byte { - seed := make([]byte, seedLength) + seed := make([]byte, gocrypto.KeyGenSeedMinLen) _, err := rand.Read(seed) require.NoError(t, err) - pk, err := gocrypto.GeneratePrivateKey(crypto.RuntimeToCryptoSigningAlgorithm(s), seed) + sk, err := gocrypto.GeneratePrivateKey(crypto.RuntimeToCryptoSigningAlgorithm(s), seed) require.NoError(t, err) - return pk.PublicKey().Encode() + return sk.PublicKey().Encode() } t.Run("Unknown algorithm should return false", func(t *testing.T) { @@ -463,12 +460,14 @@ func TestValidatePublicKey(t *testing.T) { runtime.SignatureAlgorithmBLS_BLS12_381, } for i, s := range signatureAlgos { + t.Run(fmt.Sprintf("case %v: %v", i, s), func(t *testing.T) { key := validPublicKey(t, s) + // This may cause flakiness depending on the public key + // deserialization scheme used!! key[0] ^= 1 // alter one bit of the valid key - err := crypto.ValidatePublicKey(s, key) - require.Error(t, err) + require.Errorf(t, err, "key is %#x", key) }) } }) From 54b92e63c3f2bde9407a6b0fb936ec0697b33dbd Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 25 Apr 2023 19:02:10 -0600 Subject: [PATCH 050/200] fix E2 infinity set and check to be based on projective Z --- crypto/bls.go | 2 +- crypto/bls12381_utils.c | 6 ++++-- crypto/bls12381_utils.go | 4 +--- crypto/bls_core.c | 4 ++-- crypto/bls_test.go | 41 ++++++++++++++++++++++++++++++++++++---- 5 files changed, 45 insertions(+), 12 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index d45ea7f3aeb..a2d372aca25 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -227,7 +227,7 @@ func (pk *pubKeyBLSBLS12381) Verify(s Signature, data []byte, kmac hash.Hasher) } // 0xC0 is the header of the point at infinity serialization (either in G1 or G2) -const infinityPointHeader = 0xC0 +const infinityPointHeader = byte(0xC0) var identityBLSSignature = append([]byte{infinityPointHeader}, make([]byte, signatureLengthBLSBLS12381-1)...) 
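The 0xC0 header used for identityBLSSignature above follows the compressed Zcash-style serialization: the most significant bit (0x80) marks a compressed encoding and the next bit (0x40) marks the point at infinity, so the identity point encodes as 0xC0 followed by zero bytes. A small standalone sketch of that encoding and its check, assuming the 48-byte compressed G1 signature size used by this package (illustrative helper names, not the package API):

package main

import (
	"bytes"
	"fmt"
)

const (
	infinityPointHeader = byte(0xC0) // compressed flag (0x80) + infinity flag (0x40)
	signatureLen        = 48         // compressed G1 point, per this package's signature length
)

// identitySignature builds the serialization of the identity (infinity) signature.
func identitySignature() []byte {
	return append([]byte{infinityPointHeader}, make([]byte, signatureLen-1)...)
}

// isIdentitySignature checks the 0xC0 header followed by all-zero bytes.
func isIdentitySignature(s []byte) bool {
	return len(s) == signatureLen &&
		s[0] == infinityPointHeader &&
		bytes.Equal(s[1:], make([]byte, signatureLen-1))
}

func main() {
	sig := identitySignature()
	fmt.Printf("%x...\n", sig[:4])        // c0000000...
	fmt.Println(isIdentitySignature(sig)) // true
}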
diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 9b91e8e0ebd..d722531ec65 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -950,12 +950,14 @@ void E2_write_bytes(byte *bin, const E2* a) { // set p to infinity void E2_set_infty(E2* p) { - vec_zero(p, sizeof(E2)); + // BLST infinity points are defined by Z=0 + vec_zero(p->z, sizeof(p->z)); } // check if `p` is infinity bool_t E2_is_infty(const E2* p) { - return vec_is_zero(p, sizeof(E2)); + // BLST infinity points are defined by Z=0 + return vec_is_zero(p->z, sizeof(p->z)); } // checks affine point `p` is in E2 diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 59776fcec5b..56c63700753 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -166,9 +166,7 @@ func writeScalar(dest []byte, x *scalar) { // The slice should be of size PubKeyLenBLSBLS12381 and the serialization // follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves func writePointG2(dest []byte, a *pointE2) { - C.E2_write_bytes((*C.uchar)(&dest[0]), - (*C.E2)(a), - ) + C.E2_write_bytes((*C.uchar)(&dest[0]), (*C.E2)(a)) } // writePointG1 writes a G1 point in a slice of bytes diff --git a/crypto/bls_core.c b/crypto/bls_core.c index e29d3401d69..d92b4e992e6 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -86,7 +86,7 @@ void bls_sign(byte* s, const Fr* sk, const byte* data, const int len) { // and a message data. // The signature and public key are assumed to be in G1 and G2 respectively. This // function only checks the pairing equality. -static int bls_verify_ep(const E2* pk, const ep_t s, const byte* data, const int len) { +static int bls_verify_ep(const E2* pk, const ep_t s, const byte* data, const int len) { ep_t elemsG1[2]; ep2_t elemsG2[2]; @@ -137,7 +137,7 @@ static int bls_verify_ep(const E2* pk, const ep_t s, const byte* data, const int goto out; } } - + out: ep_free(elemsG1[0]); ep_free(elemsG1[1]); diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 260c9295994..e0fb9f29460 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -186,13 +186,16 @@ func TestBLSEncodeDecode(t *testing.T) { assert.True(t, IsInvalidInputsError(err)) assert.Nil(t, sk) - // identity public key + // decode an identity public key pkBytes := make([]byte, PubKeyLenBLSBLS12381) pkBytes[0] = infinityPointHeader pk, err := DecodePublicKey(BLSBLS12381, pkBytes) require.NoError(t, err, "decoding identity public key should succeed") assert.True(t, pk.Equals(IdentityBLSPublicKey())) + // encode an identity public key + assert.Equal(t, pk.Encode(), pkBytes) + // invalid point pkBytes = make([]byte, PubKeyLenBLSBLS12381) pkBytes[0] = invalidBLSSignatureHeader @@ -436,7 +439,7 @@ func TestBLSAggregateSignatures(t *testing.T) { // Aggregate n public keys and their respective private keys and compare // the public key of the aggregated private key is equal to the aggregated // public key -func TestBLSAggregatePubKeys(t *testing.T) { +func TestBLSAggregatePublicKeys(t *testing.T) { rand := getPRG(t) // number of keys to aggregate pkNum := rand.Intn(100) + 1 @@ -507,8 +510,8 @@ func TestBLSAggregatePubKeys(t *testing.T) { // check that the public key corresponding to the zero private key is indeed identity // The package doesn't allow to generate a zero private key. 
One way to obtain a zero - // private key is via aggrgeting opposite private keys - t.Run("public key of zero private key", func(t *testing.T) { + // private key is via aggregating opposite private keys + t.Run("Identity public key from identity private key", func(t *testing.T) { // sk1 is group order of bls12-381 minus one groupOrderMinus1 := []byte{0x73, 0xED, 0xA7, 0x53, 0x29, 0x9D, 0x7D, 0x48, 0x33, 0x39, 0xD8, 0x08, 0x09, 0xA1, 0xD8, 0x05, 0x53, 0xBD, 0xA4, 0x02, 0xFF, 0xFE, @@ -520,9 +523,39 @@ func TestBLSAggregatePubKeys(t *testing.T) { one[PrKeyLenBLSBLS12381-1] = 1 sk2, err := DecodePrivateKey(BLSBLS12381, one) require.NoError(t, err) + // public key of aggregated private keys aggSK, err := AggregateBLSPrivateKeys([]PrivateKey{sk1, sk2}) require.NoError(t, err) assert.True(t, aggSK.PublicKey().Equals(IdentityBLSPublicKey())) + // aggregated public keys + aggPK, err := AggregateBLSPublicKeys([]PublicKey{sk1.PublicKey(), sk2.PublicKey()}) + require.NoError(t, err) + assert.True(t, aggPK.Equals(IdentityBLSPublicKey())) + // check of internal identity flag + blsKey, ok := aggPK.(*pubKeyBLSBLS12381) + require.True(t, ok) + assert.True(t, blsKey.isIdentity) + // check of encoding header + pkBytes := aggPK.Encode() + assert.Equal(t, infinityPointHeader, pkBytes[0]) + }) + + t.Run("Identity public key from opposite points", func(t *testing.T) { + pkBytes := pks[0].Encode() + negatePoint(pkBytes) + minusPk, err := DecodePublicKey(BLSBLS12381, pkBytes) + require.NoError(t, err) + // aggregated public keys + aggPK, err := AggregateBLSPublicKeys([]PublicKey{pks[0], minusPk}) + require.NoError(t, err) + assert.True(t, aggPK.Equals(IdentityBLSPublicKey())) + // check of internal identity flag + blsKey, ok := aggPK.(*pubKeyBLSBLS12381) + require.True(t, ok) + assert.True(t, blsKey.isIdentity) + // check of encoding header + pkBytes = aggPK.Encode() + assert.Equal(t, infinityPointHeader, pkBytes[0]) }) } From f387ce90839937773dfd550f4638a0dd332602e4 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 26 Apr 2023 14:07:41 -0600 Subject: [PATCH 051/200] fix warning --- crypto/bls12381_utils.c | 26 +++++++++++++------------- crypto/bls12381_utils.go | 2 +- crypto/bls12381_utils.h | 10 +++++----- crypto/dkg_test.go | 15 ++++++++------- 4 files changed, 27 insertions(+), 26 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index d722531ec65..64efee4bcfc 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -154,17 +154,17 @@ Fr* Fr_relic_to_blst(const bn_st* x){ // returns true if a == 0 and false otherwise bool_t Fr_is_zero(const Fr* a) { - return bytes_are_zero((const byte*)a, Fr_BYTES); + return bytes_are_zero((const byte*)a, sizeof(Fr)); } // returns true if a == b and false otherwise bool_t Fr_is_equal(const Fr* a, const Fr* b) { - return vec_is_equal(a, b, Fr_BYTES); + return vec_is_equal(a, b, sizeof(Fr)); } // sets `a` to limb `l` void Fr_set_limb(Fr* a, const limb_t l){ - vec_zero((byte*)a + sizeof(limb_t), Fr_BYTES - sizeof(limb_t)); + vec_zero((byte*)a + sizeof(limb_t), sizeof(Fr) - sizeof(limb_t)); *((limb_t*)a) = l; } @@ -304,7 +304,7 @@ static void pow256_from_Fr(pow256 ret, const Fr* in) { // - BLST_BAD_ENCODING if the length is invalid // - BLST_BAD_SCALAR if the scalar isn't in Fr // - BLST_SUCCESS if the scalar is valid -BLST_ERROR Fr_read_bytes(Fr* a, const uint8_t *bin, int len) { +BLST_ERROR Fr_read_bytes(Fr* a, const byte *bin, int len) { if (len != Fr_BYTES) { return BLST_BAD_ENCODING; } @@ -325,7 +325,7 @@ BLST_ERROR 
Fr_read_bytes(Fr* a, const uint8_t *bin, int len) { // - BLST_BAD_ENCODING if the length is invalid // - BLST_BAD_SCALAR if the scalar isn't in Fr_star // - BLST_SUCCESS if the scalar is valid -BLST_ERROR Fr_star_read_bytes(Fr* a, const uint8_t *bin, int len) { +BLST_ERROR Fr_star_read_bytes(Fr* a, const byte *bin, int len) { int ret = Fr_read_bytes(a, bin, len); if (ret != BLST_SUCCESS) { return ret; @@ -338,28 +338,28 @@ BLST_ERROR Fr_star_read_bytes(Fr* a, const uint8_t *bin, int len) { } // write Fr element `a` in big endian bytes. -void Fr_write_bytes(uint8_t *bin, const Fr* a) { +void Fr_write_bytes(byte *bin, const Fr* a) { be_bytes_from_limbs(bin, (limb_t*)a, Fr_BYTES); } // maps big-endian bytes into an Fr element using modular reduction // Input is byte-big-endian, output is vec256 (also used as Fr) -static void vec256_from_be_bytes(Fr* out, const unsigned char *bytes, size_t n) +static void vec256_from_be_bytes(Fr* out, const byte *bytes, size_t n) { Fr digit, radix; Fr_set_zero(out); Fr_copy(&radix, (Fr*)BLS12_381_rRR); // R^2 - bytes += n; + byte* p = bytes + n; while (n > Fr_BYTES) { - limbs_from_be_bytes((limb_t*)&digit, bytes -= Fr_BYTES, Fr_BYTES); // l_i + limbs_from_be_bytes((limb_t*)&digit, p -= Fr_BYTES, Fr_BYTES); // l_i Fr_mul_montg(&digit, &digit, &radix); // l_i * R^i (i is the loop number starting at 1) Fr_add(out, out, &digit); Fr_mul_montg(&radix, &radix, (Fr*)BLS12_381_rRR); // R^(i+1) n -= Fr_BYTES; } Fr_set_zero(&digit); - limbs_from_be_bytes((limb_t*)&digit, bytes -= n, n); + limbs_from_be_bytes((limb_t*)&digit, p - n, n); Fr_mul_montg(&digit, &digit, &radix); Fr_add(out, out, &digit); // at this point : out = l_1*R + L_2*R^2 .. + L_n*R^n @@ -504,8 +504,8 @@ static int fp_read_bin_safe(fp_t a, const uint8_t *bin, int len) { // returns the sign of y. // 1 if y > (p - 1)/2 and 0 otherwise. // y is in montgomery form -static byte Fp_get_sign(const fp_t y) { - return sgn0_pty_mont_384(y, BLS12_381_P, p0); +static byte Fp_get_sign(const Fp* y) { + return sgn0_pty_mont_384((const limb_t*)y, BLS12_381_P, p0); } // ------------------- Fp^2 utilities @@ -1303,7 +1303,7 @@ void ep2_rand_G2complement(ep2_t p) { // This is a testing function. // It wraps a call to a Relic macro since cgo can't call macros. 
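The vec256_from_be_bytes routine patched below reduces an arbitrary-length big-endian byte string into Fr by processing it limb-chunk by limb-chunk in Montgomery form, accumulating l_1*R + l_2*R^2 + ... before a final conversion. The end result is simply the input interpreted as a big-endian integer reduced modulo the group order r, which is also why randFr samples an extra 128 bits before reducing: the wider input makes the reduced output nearly uniform. A sketch of the equivalent computation with math/big, taking r from the groupOrderMinus1 test vector earlier in these patches plus one:

package main

import (
	"fmt"
	"math/big"
)

// r is the BLS12-381 scalar field (group) order.
var r, _ = new(big.Int).SetString(
	"73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001", 16)

// mapBytesToFr reduces an arbitrary-length big-endian byte string into [0, r),
// the same value the limb-wise Montgomery accumulation computes.
func mapBytesToFr(b []byte) *big.Int {
	return new(big.Int).Mod(new(big.Int).SetBytes(b), r)
}

func main() {
	in := make([]byte, 48) // e.g. an expanded hash output, wider than the 32-byte modulus
	for i := range in {
		in[i] = 0xff
	}
	fmt.Printf("%x\n", mapBytesToFr(in))
}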
-void xmd_sha256(uint8_t *hash, int len_hash, uint8_t *msg, int len_msg, uint8_t *dst, int len_dst){ +void xmd_sha256(byte *hash, int len_hash, byte *msg, int len_msg, byte *dst, int len_dst){ md_xmd_sh256(hash, len_hash, msg, len_msg, dst, len_dst); } diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 56c63700753..2c5da2495f4 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -7,7 +7,7 @@ package crypto // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros +// #cgo CFLAGS: -fsanitize=thread -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 01f68610603..ecdc0ada0fe 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -113,10 +113,10 @@ void Fr_from_montg(Fr *res, const Fr *a); void Fr_exp_montg(Fr *res, const Fr* base, const limb_t* expo, const int expo_len); void Fr_inv_montg_eucl(Fr *res, const Fr *a); void Fr_inv_exp_montg(Fr *res, const Fr *a); -BLST_ERROR Fr_read_bytes(Fr* a, const uint8_t *bin, int len); -BLST_ERROR Fr_star_read_bytes(Fr* a, const uint8_t *bin, int len); -void Fr_write_bytes(uint8_t *bin, const Fr* a); -bool map_bytes_to_Fr(Fr*, const uint8_t*, int); +BLST_ERROR Fr_read_bytes(Fr* a, const byte *bin, int len); +BLST_ERROR Fr_star_read_bytes(Fr* a, const byte *bin, int len); +void Fr_write_bytes(byte *bin, const Fr* a); +bool map_bytes_to_Fr(Fr*, const byte*, int); // Fp utilities @@ -166,7 +166,7 @@ void precomputed_data_set(const prec_st* p); void seed_relic(byte*, int); // utility testing function -void xmd_sha256(uint8_t *, int, uint8_t *, int, uint8_t *, int); +void xmd_sha256(byte *, int, byte *, int, byte *, int); // Debugging related functions void bytes_print_(char*, byte*, int); diff --git a/crypto/dkg_test.go b/crypto/dkg_test.go index 0329eb453ea..b2d55e6bf18 100644 --- a/crypto/dkg_test.go +++ b/crypto/dkg_test.go @@ -104,17 +104,18 @@ func testJointFeldman(t *testing.T) { n := 4 var threshold int // happy path, test multiple values of thresold - for threshold = MinimumThreshold; threshold < n; threshold++ { - t.Run(fmt.Sprintf("JointFeldman_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { - dkgCommonTest(t, jointFeldman, n, threshold, happyPath) - }) - } + //for threshold = MinimumThreshold; threshold < n; threshold++ { + threshold = optimalThreshold(n) + t.Run(fmt.Sprintf("JointFeldman_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { + dkgCommonTest(t, jointFeldman, n, threshold, happyPath) + }) + //} // unhappy path, with focus on the optimal threshold value n = 5 threshold = optimalThreshold(n) // unhappy path, with invalid shares - t.Run(fmt.Sprintf("JointFeldman_InvalidShares_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { + /*t.Run(fmt.Sprintf("JointFeldman_InvalidShares_(n,t)=(%d,%d)", n, 
threshold), func(t *testing.T) { dkgCommonTest(t, jointFeldman, n, threshold, invalidShares) }) // unhappy path, with invalid vector @@ -132,7 +133,7 @@ func testJointFeldman(t *testing.T) { // unhappy path, with duplicated messages (all types) t.Run(fmt.Sprintf("JointFeldman_DuplicatedMessages_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { dkgCommonTest(t, jointFeldman, n, threshold, duplicatedMessages) - }) + })*/ } // Supported Key Generation protocols From cbe51a372af1605e699b62dfdd10d1b0a67069d1 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 26 Apr 2023 14:14:58 -0600 Subject: [PATCH 052/200] disable thread SAN --- crypto/bls12381_utils.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 2c5da2495f4..56c63700753 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -7,7 +7,7 @@ package crypto // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -// #cgo CFLAGS: -fsanitize=thread -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros +// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ From be727d855fb5e28d94821e1a0cc9aca2e704c358 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 26 Apr 2023 14:41:25 -0600 Subject: [PATCH 053/200] add SIGILL handler --- crypto/bls12381_utils.go | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 56c63700753..be5991fb0e9 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -12,6 +12,25 @@ package crypto // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ // #include "bls12381_utils.h" +// +// #if defined(__x86_64__) && (defined(__unix__) || defined(__APPLE__)) +// # include +// # include +// static void handler(int signum) +// { ssize_t n = write(2, "Caught SIGILL in blst_cgo_init, " +// "consult /bindings/go/README.md.\n", 70); +// _exit(128+SIGILL); +// (void)n; +// } +// __attribute__((constructor)) static void blst_cgo_init() +// { blst_fp temp = { 0 }; +// struct sigaction act = { handler }, oact; +// sigaction(SIGILL, &act, &oact); +// blst_fp_sqr(&temp, &temp); +// sigaction(SIGILL, &oact, NULL); +// } +// #endif +// import "C" import ( "crypto/rand" From 6bd85a79a83393fe77954c2e5ded7bd8697b6f68 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 26 Apr 2023 16:37:03 -0600 Subject: [PATCH 054/200] fix blst_cgo_init --- crypto/bls12381_utils.c | 2 +- crypto/bls12381_utils.go | 6 +++--- crypto/bls12381_utils.h | 2 ++ 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 64efee4bcfc..b9ec974fee3 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -350,7 +350,7 @@ static void vec256_from_be_bytes(Fr* out, const byte *bytes, 
size_t n) Fr_set_zero(out); Fr_copy(&radix, (Fr*)BLS12_381_rRR); // R^2 - byte* p = bytes + n; + byte* p = (byte*)bytes + n; while (n > Fr_BYTES) { limbs_from_be_bytes((limb_t*)&digit, p -= Fr_BYTES, Fr_BYTES); // l_i Fr_mul_montg(&digit, &digit, &radix); // l_i * R^i (i is the loop number starting at 1) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index be5991fb0e9..52a0dde0248 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -23,10 +23,10 @@ package crypto // (void)n; // } // __attribute__((constructor)) static void blst_cgo_init() -// { blst_fp temp = { 0 }; -// struct sigaction act = { handler }, oact; +// { Fp temp = { 0 }; +// struct sigaction act = {{ handler }}, oact; // sigaction(SIGILL, &act, &oact); -// blst_fp_sqr(&temp, &temp); +// Fp_squ_montg(&temp, &temp); // sigaction(SIGILL, &oact, NULL); // } // #endif diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index ecdc0ada0fe..ca69b584201 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -119,6 +119,8 @@ void Fr_write_bytes(byte *bin, const Fr* a); bool map_bytes_to_Fr(Fr*, const byte*, int); // Fp utilities +void Fp_mul_montg(Fp *, const Fp *, const Fp *); +void Fp_squ_montg(Fp *, const Fp *); // E1 and G1 utilities int ep_read_bin_compact(ep_t, const byte *, const int); From 76fcc73fcb6a0d58b55012edce7aedf28aaa395c Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 26 Apr 2023 21:01:39 -0600 Subject: [PATCH 055/200] disable ADX instructions in BlST by default as a temp measure --- crypto/Makefile | 2 +- crypto/bls12381_utils.go | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/crypto/Makefile b/crypto/Makefile index c66774e1033..d87f27c440f 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -22,7 +22,7 @@ relic_tests: ifeq ($(ADX_SUPPORT), 1) go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) --tags relic $(if $(VERBOSE),-v,) else - CGO_CFLAGS="-D__BLST_PORTABLE__" go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) --tags relic $(if $(VERBOSE),-v,) + CGO_CFLAGS="-O -D__BLST_PORTABLE__" go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) --tags relic $(if $(VERBOSE),-v,) endif # test all packages that do not require Relic library (all functionalities except the BLS-related ones) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 52a0dde0248..38e012a1510 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -7,7 +7,7 @@ package crypto // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros +// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -O -D__BLST_PORTABLE__ -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: 
-D__BLST_NO_ASM__ @@ -17,12 +17,12 @@ package crypto // # include // # include // static void handler(int signum) -// { ssize_t n = write(2, "Caught SIGILL in blst_cgo_init, " -// "consult /bindings/go/README.md.\n", 70); +// { char text[1024] = "Caught SIGILL in blst_cgo_init, BLST library (used by flow-go/crypto) requires ADX support, build with CGO_CFLAGS=-O -D__BLST_PORTABLE__"; +// ssize_t n = write(2, &text, strlen(text)); // _exit(128+SIGILL); // (void)n; // } -// __attribute__((constructor)) static void blst_cgo_init() +// __attribute__((constructor)) static void flow_crypto_cgo_init() // { Fp temp = { 0 }; // struct sigaction act = {{ handler }}, oact; // sigaction(SIGILL, &act, &oact); From 2a851b5551cb775af5b2db0a2c19dad7d72c8c1e Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 28 Apr 2023 14:45:43 -0600 Subject: [PATCH 056/200] uncomment DKG tests --- crypto/dkg_test.go | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/crypto/dkg_test.go b/crypto/dkg_test.go index b2d55e6bf18..0329eb453ea 100644 --- a/crypto/dkg_test.go +++ b/crypto/dkg_test.go @@ -104,18 +104,17 @@ func testJointFeldman(t *testing.T) { n := 4 var threshold int // happy path, test multiple values of thresold - //for threshold = MinimumThreshold; threshold < n; threshold++ { - threshold = optimalThreshold(n) - t.Run(fmt.Sprintf("JointFeldman_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { - dkgCommonTest(t, jointFeldman, n, threshold, happyPath) - }) - //} + for threshold = MinimumThreshold; threshold < n; threshold++ { + t.Run(fmt.Sprintf("JointFeldman_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { + dkgCommonTest(t, jointFeldman, n, threshold, happyPath) + }) + } // unhappy path, with focus on the optimal threshold value n = 5 threshold = optimalThreshold(n) // unhappy path, with invalid shares - /*t.Run(fmt.Sprintf("JointFeldman_InvalidShares_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { + t.Run(fmt.Sprintf("JointFeldman_InvalidShares_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { dkgCommonTest(t, jointFeldman, n, threshold, invalidShares) }) // unhappy path, with invalid vector @@ -133,7 +132,7 @@ func testJointFeldman(t *testing.T) { // unhappy path, with duplicated messages (all types) t.Run(fmt.Sprintf("JointFeldman_DuplicatedMessages_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { dkgCommonTest(t, jointFeldman, n, threshold, duplicatedMessages) - })*/ + }) } // Supported Key Generation protocols From 26e56364f9a929a90879d9dc782916b6f1e4b12d Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 28 Apr 2023 20:09:24 -0600 Subject: [PATCH 057/200] more logging in FeldmanVSSQ when shares aren't matching computed keys from verif vector --- crypto/bls.go | 4 ++-- crypto/bls12381_utils.go | 13 +++++++++++++ crypto/dkg_feldmanvssq.go | 24 ++++++++++++++++-------- crypto/dkg_jointfeldman.go | 2 +- 4 files changed, 32 insertions(+), 11 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index a2d372aca25..c3a413b6443 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -444,7 +444,7 @@ func (sk *prKeyBLSBLS12381) Equals(other PrivateKey) bool { // String returns the hex string representation of the key. func (sk *prKeyBLSBLS12381) String() string { - return fmt.Sprintf("%#x", sk.Encode()) + return sk.scalar.String() } // pubKeyBLSBLS12381 is the public key of BLS using BLS12_381, @@ -520,7 +520,7 @@ func (pk *pubKeyBLSBLS12381) Equals(other PublicKey) bool { // String returns the hex string representation of the key. 
func (pk *pubKeyBLSBLS12381) String() string { - return fmt.Sprintf("%#x", pk.Encode()) + return pk.point.String() } // Get Macro definitions from the C layer as Cgo does not export macros diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 38e012a1510..735e1ffc00e 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -35,6 +35,7 @@ import "C" import ( "crypto/rand" "errors" + "fmt" ) // Go wrappers around BLST C types @@ -69,6 +70,18 @@ var blst_bad_encoding = (int)(C.BLST_BAD_ENCODING) var blst_bad_scalar = (int)(C.BLST_BAD_SCALAR) var blst_point_not_on_curve = (int)(C.BLST_POINT_NOT_ON_CURVE) +func (a *scalar) String() string { + encoding := make([]byte, frBytesLen) + writeScalar(encoding, a) + return fmt.Sprintf("%#x", encoding) +} + +func (p *pointE2) String() string { + encoding := make([]byte, pubKeyLengthBLSBLS12381) + writePointG2(encoding, p) + return fmt.Sprintf("%#x", encoding) +} + // initContext sets relic B12_381 parameters and precomputes some data in the C layer func (ct *ctx) initContext() error { c := C.relic_init_BLS12_381() diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index 38b3667ffae..5a10a210949 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -162,7 +162,7 @@ func (s *feldmanVSSQualState) End() (PrivateKey, PublicKey, []PublicKey, error) if c.received && !c.answerReceived { s.disqualified = true s.processor.Disqualify(int(s.dealerIndex), - fmt.Sprintf("complaint from %d was not answered", + fmt.Sprintf("complaint from (%d) was not answered", complainer)) break } @@ -412,7 +412,7 @@ func (s *feldmanVSSQualState) receiveShare(origin index, data []byte) { if s.vAReceived { if !s.verifyShare() { - // otherwise, build a complaint + // build a complaint s.buildAndBroadcastComplaint() } } @@ -465,8 +465,8 @@ func (s *feldmanVSSQualState) receiveVerifVector(origin index, data []byte) { if s.checkComplaint(complainer, c) { s.disqualified = true s.processor.Disqualify(int(s.dealerIndex), - fmt.Sprintf("verification vector received: a complaint answer to %d is invalid", - complainer)) + fmt.Sprintf("verification vector received: a complaint answer to (%d) is invalid, answer is %s, computed key is %s", + complainer, &c.answer, &s.y[complainer])) return } } @@ -482,6 +482,14 @@ func (s *feldmanVSSQualState) receiveVerifVector(origin index, data []byte) { // build a complaint against the dealer, add it to the local // complaint map and broadcast it func (s *feldmanVSSQualState) buildAndBroadcastComplaint() { + var logMsg string + if s.vAReceived && s.xReceived { + logMsg = fmt.Sprintf("building a complaint, share is %s, computed public key is %s", + &s.x, &s.y[s.myIndex]) + } else { + logMsg = "building a complaint" + } + s.processor.FlagMisbehavior(int(s.dealerIndex), logMsg) s.complaints[s.myIndex] = &complaint{ received: true, answerReceived: false, @@ -582,8 +590,8 @@ func (s *feldmanVSSQualState) receiveComplaint(origin index, data []byte) { s.disqualified = s.checkComplaint(origin, c) if s.disqualified { s.processor.Disqualify(int(s.dealerIndex), - fmt.Sprintf("complaint received: complaint answer to %d is invalid", - origin)) + fmt.Sprintf("complaint received: answer to (%d) is invalid, answer is %s, computed public key is %s", + origin, &c.answer, &s.y[origin])) } return } @@ -656,8 +664,8 @@ func (s *feldmanVSSQualState) receiveComplaintAnswer(origin index, data []byte) s.disqualified = s.checkComplaint(complainer, c) if s.disqualified { s.processor.Disqualify(int(s.dealerIndex), - 
fmt.Sprintf("complaint answer received: complaint answer to %d is invalid", - complainer)) + fmt.Sprintf("complaint answer received: answer to (%d) is invalid, answer is %s, computed key is %s", + complainer, &c.answer, &s.y[complainer])) } } diff --git a/crypto/dkg_jointfeldman.go b/crypto/dkg_jointfeldman.go index b15c421dde6..8de9695a0c5 100644 --- a/crypto/dkg_jointfeldman.go +++ b/crypto/dkg_jointfeldman.go @@ -194,7 +194,7 @@ func (s *JointFeldmanState) End() (PrivateKey, PublicKey, []PublicKey, error) { if disqualifiedTotal > s.threshold || s.size-disqualifiedTotal <= s.threshold { return nil, nil, nil, dkgFailureErrorf( - "Joint-Feldman failed because the diqualified participants number is high: %d disqualified, threshold is %d, size is %d", + "Joint-Feldman failed because the disqualified participants number is high: %d disqualified, threshold is %d, size is %d", disqualifiedTotal, s.threshold, s.size) } From 5e01e46951cd2e2967f78ccd8e1fc7395764185f Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 1 May 2023 10:57:47 -0600 Subject: [PATCH 058/200] use new KeyGenSeedMinLen --- cmd/bootstrap/cmd/dkg.go | 2 +- cmd/bootstrap/dkg/dkg_test.go | 2 +- cmd/bootstrap/run/qc_test.go | 2 +- consensus/hotstuff/signature/randombeacon_inspector_test.go | 2 +- engine/consensus/dkg/reactor_engine.go | 2 +- integration/testnet/util.go | 2 +- module/dkg/controller_test.go | 2 +- state/protocol/badger/validity_test.go | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cmd/bootstrap/cmd/dkg.go b/cmd/bootstrap/cmd/dkg.go index b190b1a7c2c..de7f01bc6bd 100644 --- a/cmd/bootstrap/cmd/dkg.go +++ b/cmd/bootstrap/cmd/dkg.go @@ -22,7 +22,7 @@ func runDKG(nodes []model.NodeInfo) dkg.DKGData { if flagFastKG { dkgData, err = bootstrapDKG.RunFastKG(n, flagBootstrapRandomSeed) } else { - dkgData, err = bootstrapDKG.RunDKG(n, GenerateRandomSeeds(n, crypto.SeedMinLenDKG)) + dkgData, err = bootstrapDKG.RunDKG(n, GenerateRandomSeeds(n, crypto.KeyGenSeedMinLen)) } if err != nil { log.Fatal().Err(err).Msg("error running DKG") diff --git a/cmd/bootstrap/dkg/dkg_test.go b/cmd/bootstrap/dkg/dkg_test.go index 9835cdca538..73fb185ec33 100644 --- a/cmd/bootstrap/dkg/dkg_test.go +++ b/cmd/bootstrap/dkg/dkg_test.go @@ -10,7 +10,7 @@ import ( ) func TestRunDKG(t *testing.T) { - seedLen := crypto.SeedMinLenDKG + seedLen := crypto.KeyGenSeedMinLen _, err := RunDKG(0, unittest.SeedFixtures(2, seedLen)) require.EqualError(t, err, "n needs to match the number of seeds (0 != 2)") diff --git a/cmd/bootstrap/run/qc_test.go b/cmd/bootstrap/run/qc_test.go index 5deed36d1ed..4f925a5e793 100644 --- a/cmd/bootstrap/run/qc_test.go +++ b/cmd/bootstrap/run/qc_test.go @@ -50,7 +50,7 @@ func createSignerData(t *testing.T, n int) *ParticipantData { networkingKeys := unittest.NetworkingKeys(n) stakingKeys := unittest.StakingKeys(n) - seed := make([]byte, crypto.SeedMinLenDKG) + seed := make([]byte, crypto.KeyGenSeedMinLen) _, err := rand.Read(seed) require.NoError(t, err) randomBSKs, randomBPKs, groupKey, err := crypto.BLSThresholdKeyGen(n, diff --git a/consensus/hotstuff/signature/randombeacon_inspector_test.go b/consensus/hotstuff/signature/randombeacon_inspector_test.go index 5784577f668..5e2a08e7c91 100644 --- a/consensus/hotstuff/signature/randombeacon_inspector_test.go +++ b/consensus/hotstuff/signature/randombeacon_inspector_test.go @@ -40,7 +40,7 @@ func (rs *randomBeaconSuite) SetupTest() { // generate threshold keys mrand.Seed(time.Now().UnixNano()) - seed := make([]byte, crypto.SeedMinLenDKG) + seed := 
make([]byte, crypto.KeyGenSeedMinLen) _, err := mrand.Read(seed) require.NoError(rs.T(), err) rs.skShares, rs.pkShares, rs.pkGroup, err = crypto.BLSThresholdKeyGen(rs.n, rs.threshold, seed) diff --git a/engine/consensus/dkg/reactor_engine.go b/engine/consensus/dkg/reactor_engine.go index 1704483ef48..1d23344e4c6 100644 --- a/engine/consensus/dkg/reactor_engine.go +++ b/engine/consensus/dkg/reactor_engine.go @@ -348,7 +348,7 @@ func (e *ReactorEngine) getDKGInfo(firstBlockID flow.Identifier) (*dkgInfo, erro if err != nil { return nil, fmt.Errorf("could not retrieve epoch dkg final views: %w", err) } - seed := make([]byte, crypto.SeedMinLenDKG) + seed := make([]byte, crypto.KeyGenSeedMinLen) _, err = rand.Read(seed) if err != nil { return nil, fmt.Errorf("could not generate random seed: %w", err) } diff --git a/integration/testnet/util.go b/integration/testnet/util.go index ad45be97c82..52ab6af17a0 100644 --- a/integration/testnet/util.go +++ b/integration/testnet/util.go @@ -71,7 +71,7 @@ func toNodeInfos(confs []ContainerConfig) []bootstrap.NodeInfo { } func getSeed() ([]byte, error) { - seedLen := int(math.Max(crypto.SeedMinLenDKG, crypto.KeyGenSeedMinLen)) + seedLen := int(math.Max(crypto.KeyGenSeedMinLen, crypto.KeyGenSeedMinLen)) seed := make([]byte, seedLen) n, err := rand.Read(seed) if err != nil || n != seedLen { diff --git a/module/dkg/controller_test.go b/module/dkg/controller_test.go index 03f10adf1c1..2e3b8cce8b5 100644 --- a/module/dkg/controller_test.go +++ b/module/dkg/controller_test.go @@ -248,7 +248,7 @@ func initNodes(t *testing.T, n int, phase1Duration, phase2Duration, phase3Durati logger: logger, } - seed := unittest.SeedFixture(20) + seed := unittest.SeedFixture(crypto.KeyGenSeedMinLen) dkg, err := crypto.NewJointFeldman(n, signature.RandomBeaconThreshold(n), i, broker) require.NoError(t, err) diff --git a/state/protocol/badger/validity_test.go b/state/protocol/badger/validity_test.go index 2c0e3372e4b..9d564d76e30 100644 --- a/state/protocol/badger/validity_test.go +++ b/state/protocol/badger/validity_test.go @@ -49,7 +49,7 @@ func TestEpochSetupValidity(t *testing.T) { t.Run("short seed", func(t *testing.T) { _, result, _ := unittest.BootstrapFixture(participants) setup := result.ServiceEvents[0].Event.(*flow.EpochSetup) - setup.RandomSource = unittest.SeedFixture(crypto.SeedMinLenDKG - 1) + setup.RandomSource = unittest.SeedFixture(crypto.KeyGenSeedMinLen - 1) err := verifyEpochSetup(setup, true) require.Error(t, err) From bbd3c74797ef92ed6438ea98dde842c0a5211ef4 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 2 May 2023 17:39:30 -0600 Subject: [PATCH 059/200] fix a bug when zeroing a buffer that is not a multiple of 8 bytes with BLST's vec_zero --- crypto/bls12381_utils.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index b9ec974fee3..2ef4ca2e3e2 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -277,12 +277,12 @@ void Fr_sum_vector(Fr* jointx, const Fr x[], const int len) { // internal type of BLST `pow256` uses bytes little endian. // input is bytes big endian as used by Flow crypto lib external scalars.
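/* Illustrative note (not part of the original patch): the commit above refers to BLST's
 * vec_zero, which clears memory limb by limb (8 bytes at a time on 64-bit targets), so a
 * byte count that is not a multiple of 8 is rounded down and the trailing bytes are left
 * untouched. A minimal sketch of the pitfall, assuming that limb-wise behaviour:
 *
 *     byte tmp[10];
 *     vec_zero(tmp, sizeof(tmp));  // 10/8 == 1 limb cleared, bytes 8..9 keep stale data
 *     memset(tmp, 0, sizeof(tmp)); // byte-granular alternative clearing all 10 bytes
 */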
-static void pow256_from_be_bytes(pow256 ret, const unsigned char a[Fr_BYTES]) +static void pow256_from_be_bytes(pow256 ret, const byte a[Fr_BYTES]) { - unsigned char* b = (unsigned char*)a + Fr_BYTES - 1; + byte* b = (byte*)a + Fr_BYTES - 1; if ((uptr_t)ret == (uptr_t)a) { // swap in place for (int i=0; i Date: Tue, 2 May 2023 19:11:58 -0600 Subject: [PATCH 060/200] clean ups --- cmd/bootstrap/cmd/dkg.go | 2 +- crypto/bls12381_utils.c | 7 +++---- crypto/bls_core.c | 3 --- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/cmd/bootstrap/cmd/dkg.go b/cmd/bootstrap/cmd/dkg.go index de7f01bc6bd..da81d0551c6 100644 --- a/cmd/bootstrap/cmd/dkg.go +++ b/cmd/bootstrap/cmd/dkg.go @@ -20,7 +20,7 @@ func runDKG(nodes []model.NodeInfo) dkg.DKGData { var dkgData dkg.DKGData var err error if flagFastKG { - dkgData, err = bootstrapDKG.RunFastKG(n, flagBootstrapRandomSeed) + dkgData, err = bootstrapDKG.RunFastKG(n, GenerateRandomSeed(crypto.KeyGenSeedMinLen)) } else { dkgData, err = bootstrapDKG.RunDKG(n, GenerateRandomSeeds(n, crypto.KeyGenSeedMinLen)) } diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 3a0ae79fcc9..d57d31c5861 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -116,10 +116,9 @@ prec_st* init_precomputed_data_BLS12_381() { // ------------------- Fr utilities // Montgomery constant R related to the curve order r -const Fr BLS12_381_rR = { /* R mod r = (1<<256)%r */ - TO_LIMB_T(0x1824b159acc5056f), TO_LIMB_T(0x998c4fefecbc4ff5), - TO_LIMB_T(0x5884b7fa00034802), TO_LIMB_T(0x00000001fffffffe) -}; +// R mod r = (1<<256)%r +const Fr BLS12_381_rR = { TO_LIMB_T(0x1824b159acc5056f), TO_LIMB_T(0x998c4fefecbc4ff5), \ + TO_LIMB_T(0x5884b7fa00034802), TO_LIMB_T(0x00000001fffffffe), }; // TODO: temp utility function to delete bn_st* Fr_blst_to_relic(const Fr* x) { diff --git a/crypto/bls_core.c b/crypto/bls_core.c index ca51d2dc09f..58a7287578f 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -472,7 +472,6 @@ void bls_batch_verify(const int sigs_len, byte* results, const E2* pks_input, if (!sigs) goto out_sigs; for (int i=0; i < sigs_len; i++) { ep_new(sigs[i]); - ep2_new(pks[i]); } for (int i=0; i < sigs_len; i++) { @@ -501,8 +500,6 @@ void bls_batch_verify(const int sigs_len, byte* results, const E2* pks_input, // r = random + 1 Fr_set_limb(&one, 1); Fr_add(&r, &r, &one); - /*char str[20]; sprintf(str, "r-%d", i); - Fr_print_(str, &r);*/ // multiply public key and signature by the same random exponent r E2_mult(&pks[i], &pks_input[i], &r); // TODO: faster version for short expos? 
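/* Illustrative note (not part of the original patch): scaling each pair by the same
 * non-zero scalar is the standard random-linear-combination batch check. With signatures
 * in G1 and public keys in G2 (as in this library), the batched equation is, informally,
 *
 *     e( sum_i r_i * sig_i , g2 ) == prod_i e( H(m_i) , r_i * pk_i )
 *
 * which holds for honest inputs and fails with overwhelming probability if any single
 * pair is invalid; r = random + 1 simply keeps r non-zero so no term can be cancelled
 * out of the sum.
 */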
bn_st* tmp = Fr_blst_to_relic(&r); From 9f20a59cc224d64cac53f55e5504cb23133fe453 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 3 May 2023 12:08:32 -0600 Subject: [PATCH 061/200] clean up encode/decode tests in BLS --- crypto/bls_test.go | 80 +++++++++++++++++++++------------------ crypto/sign_test_utils.go | 2 +- 2 files changed, 45 insertions(+), 37 deletions(-) diff --git a/crypto/bls_test.go b/crypto/bls_test.go index c7f58fec010..c3e9bb6e9db 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -180,29 +180,35 @@ func TestBLSEncodeDecode(t *testing.T) { // specific tests for BLS // zero private key - skBytes := make([]byte, PrKeyLenBLSBLS12381) - sk, err := DecodePrivateKey(BLSBLS12381, skBytes) - require.Error(t, err, "decoding identity private key should fail") - assert.True(t, IsInvalidInputsError(err)) - assert.Nil(t, sk) - - // decode an identity public key - pkBytes := make([]byte, PubKeyLenBLSBLS12381) - pkBytes[0] = infinityPointHeader - pk, err := DecodePublicKey(BLSBLS12381, pkBytes) - require.NoError(t, err, "decoding identity public key should succeed") - assert.True(t, pk.Equals(IdentityBLSPublicKey())) - - // encode an identity public key - assert.Equal(t, pk.Encode(), pkBytes) + t.Run("zero private key", func(t *testing.T) { + skBytes := make([]byte, PrKeyLenBLSBLS12381) + sk, err := DecodePrivateKey(BLSBLS12381, skBytes) + require.Error(t, err, "decoding identity private key should fail") + assert.True(t, IsInvalidInputsError(err)) + assert.Nil(t, sk) + }) + + // identity public key + t.Run("infinity public key", func(t *testing.T) { + // decode an identity public key + pkBytes := make([]byte, PubKeyLenBLSBLS12381) + pkBytes[0] = infinityPointHeader + pk, err := DecodePublicKey(BLSBLS12381, pkBytes) + require.NoError(t, err, "decoding identity public key should succeed") + assert.True(t, pk.Equals(IdentityBLSPublicKey())) + // encode an identity public key + assert.Equal(t, pk.Encode(), pkBytes) + }) // invalid point - pkBytes = make([]byte, PubKeyLenBLSBLS12381) - pkBytes[0] = invalidBLSSignatureHeader - pk, err = DecodePublicKey(BLSBLS12381, pkBytes) - require.Error(t, err, "the key decoding should fail - key value is invalid") - assert.True(t, IsInvalidInputsError(err)) - assert.Nil(t, pk) + t.Run("invalid public key", func(t *testing.T) { + pkBytes := make([]byte, PubKeyLenBLSBLS12381) + pkBytes[0] = invalidBLSSignatureHeader + pk, err := DecodePublicKey(BLSBLS12381, pkBytes) + require.Error(t, err, "the key decoding should fail - key value is invalid") + assert.True(t, IsInvalidInputsError(err)) + assert.Nil(t, pk) + }) // Test a public key serialization with a point encoded with a coordinate x with // x[0] or x[1] not reduced mod p. @@ -213,21 +219,23 @@ func TestBLSEncodeDecode(t *testing.T) { // Although uniqueness of public key respresentation isn't a security property, some implementations // may implicitely rely on the property. 
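// Illustrative note (not part of the original patch): each coordinate component of a
// compressed G2 key is an integer that must be strictly reduced modulo the field prime p.
// Since c and c+p denote the same field element, accepting an unreduced component would
// give a second valid byte encoding for the same key; the vectors below are built by
// adding p to x[0] (respectively x[1]) of a valid key and must therefore be rejected.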
- // valid pk with x[0] < p and x[1] < p - validPk, err := hex.DecodeString("818d72183e3e908af5bd6c2e37494c749b88f0396d3fbc2ba4d9ea28f1c50d1c6a540ec8fe06b6d860f72ec9363db3b8038360809700d36d761cb266af6babe9a069dc7364d3502e84536bd893d5f09ec2dd4f07cae1f8a178ffacc450f9b9a2") - require.NoError(t, err) - _, err = DecodePublicKey(BLSBLS12381, validPk) - assert.NoError(t, err) - // invalidpk1 with x[0]+p and same x[1] - invalidPk1, err := hex.DecodeString("9B8E840277BE772540D913E47A94F94C00003BBE60C4CEEB0C0ABCC9E876034089000EC7AF5AB6D81AF62EC9363D5E63038360809700d36d761cb266af6babe9a069dc7364d3502e84536bd893d5f09ec2dd4f07cae1f8a178ffacc450f9b9a2") - require.NoError(t, err) - _, err = DecodePublicKey(BLSBLS12381, invalidPk1) - assert.Error(t, err) - // invalidpk1 with same x[0] and x[1]+p - invalidPk2, err := hex.DecodeString("818d72183e3e908af5bd6c2e37494c749b88f0396d3fbc2ba4d9ea28f1c50d1c6a540ec8fe06b6d860f72ec9363db3b81D84726AD080BA07C1385A1CF2B758C104E127F8585862EDEB843E798A86E6C2E1894F067C35F8A132FEACC450F9644D") - require.NoError(t, err) - _, err = DecodePublicKey(BLSBLS12381, invalidPk2) - assert.Error(t, err) + t.Run("public key with non-reduced coordinates", func(t *testing.T) { + // valid pk with x[0] < p and x[1] < p + validPk, err := hex.DecodeString("818d72183e3e908af5bd6c2e37494c749b88f0396d3fbc2ba4d9ea28f1c50d1c6a540ec8fe06b6d860f72ec9363db3b8038360809700d36d761cb266af6babe9a069dc7364d3502e84536bd893d5f09ec2dd4f07cae1f8a178ffacc450f9b9a2") + require.NoError(t, err) + _, err = DecodePublicKey(BLSBLS12381, validPk) + assert.NoError(t, err) + // invalidpk1 with x[0]+p and same x[1] + invalidPk1, err := hex.DecodeString("9B8E840277BE772540D913E47A94F94C00003BBE60C4CEEB0C0ABCC9E876034089000EC7AF5AB6D81AF62EC9363D5E63038360809700d36d761cb266af6babe9a069dc7364d3502e84536bd893d5f09ec2dd4f07cae1f8a178ffacc450f9b9a2") + require.NoError(t, err) + _, err = DecodePublicKey(BLSBLS12381, invalidPk1) + assert.Error(t, err) + // invalidpk1 with same x[0] and x[1]+p + invalidPk2, err := hex.DecodeString("818d72183e3e908af5bd6c2e37494c749b88f0396d3fbc2ba4d9ea28f1c50d1c6a540ec8fe06b6d860f72ec9363db3b81D84726AD080BA07C1385A1CF2B758C104E127F8585862EDEB843E798A86E6C2E1894F067C35F8A132FEACC450F9644D") + require.NoError(t, err) + _, err = DecodePublicKey(BLSBLS12381, invalidPk2) + assert.Error(t, err) + }) } // TestBLSEquals tests equal for BLS keys diff --git a/crypto/sign_test_utils.go b/crypto/sign_test_utils.go index 6ab599cff47..8e2cd1e931f 100644 --- a/crypto/sign_test_utils.go +++ b/crypto/sign_test_utils.go @@ -160,7 +160,7 @@ var BLS12381Order = []byte{0x73, 0xED, 0xA7, 0x53, 0x29, 0x9D, 0x7D, 0x48, 0x33, 0x5B, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x01} func testEncodeDecode(t *testing.T, salg SigningAlgorithm) { - t.Run(fmt.Sprintf("encode/decode for %s", salg), func(t *testing.T) { + t.Run(fmt.Sprintf("generic encode/decode for %s", salg), func(t *testing.T) { rand := getPRG(t) t.Run("happy path tests", func(t *testing.T) { From f2731a8a0030ba70f6e3e8d1e8f0c5fe47546a31 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 3 May 2023 12:12:31 -0600 Subject: [PATCH 062/200] add endianness test for maptoFr --- crypto/bls12381_utils.go | 2 +- crypto/bls12381_utils_test.go | 43 ++++++++++++++++++++++++++++++++++- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 6bbca17f2a5..0f685494d4f 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -160,7 +160,7 @@ func randFrStar(x *scalar, rand 
random.Rand) { } } -// mapToFr reads a scalar from a slice of bytes and maps it to Zr. +// mapToFr reads a scalar from a slice of bytes and maps it to Fr using modular reduction. // The resulting element `k` therefore satisfies 0 <= k < r. // It returns true if scalar is zero and false otherwise. func mapToFr(x *scalar, src []byte) bool { diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 23e13d303ce..563ca26811b 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -54,7 +54,6 @@ func BenchmarkScalarMultG1G2(b *testing.B) { // Sanity-check of the map-to-G1 with regards to the IETF draft hash-to-curve func TestMapToG1(t *testing.T) { - // test vectors from https://datatracker.ietf.org/doc/html/draft-irtf-cfrg-hash-to-curve-14#appendix-J.9.1 dst := []byte("QUUX-V01-CS02-with-BLS12381G1_XMD:SHA-256_SSWU_RO_") @@ -160,5 +159,47 @@ func BenchmarkSubgroupCheck(b *testing.B) { } b.StopTimer() }) +} +// test some edge cases of MapToFr to validate modular reduction and endianness: +// - inputs `0` and curve order `r` +// - inputs `1` and `r+1` +func TestMapToFr(t *testing.T) { + var x scalar + offset := 10 + bytes := make([]byte, frBytesLen+offset) + expectedEncoding := make([]byte, frBytesLen) + // zero bytes + isZero := mapToFr(&x, bytes) + assert.True(t, isZero) + assert.True(t, x.isZero()) + assert.Equal(t, expectedEncoding, newPrKeyBLSBLS12381(&x).Encode()) + // curve order bytes + copy(bytes[offset:], BLS12381Order) + isZero = mapToFr(&x, bytes) + assert.True(t, isZero) + assert.True(t, x.isZero()) + assert.Equal(t, expectedEncoding, newPrKeyBLSBLS12381(&x).Encode()) + // curve order + 1 + g1, err := hex.DecodeString("824aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb813e02b6052719f607dacd3a088274f65596bd0d09920b61ab5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e") + require.NoError(t, err) + bytes[len(bytes)-1] += 1 + isZero = mapToFr(&x, bytes) + assert.False(t, isZero) + assert.False(t, x.isZero()) + expectedEncoding[frBytesLen-1] = 1 + sk := newPrKeyBLSBLS12381(&x) + assert.Equal(t, expectedEncoding, sk.Encode()) + // check scalar is equal to "1" in the lower layer (scalar multiplication) + assert.Equal(t, sk.PublicKey().Encode(), g1, "scalar should be 1, check endianness in the C layer") + // 1 + copy(bytes[offset:], expectedEncoding) + isZero = mapToFr(&x, bytes) + assert.False(t, isZero) + assert.False(t, x.isZero()) + expectedEncoding[frBytesLen-1] = 1 + sk = newPrKeyBLSBLS12381(&x) + assert.Equal(t, expectedEncoding, sk.Encode()) + // check scalar is equal to "1" in the lower layer (scalar multiplication) + assert.Equal(t, sk.PublicKey().Encode(), g1, "scalar should be 1, check endianness in the C layer") } From 70c3c64734e64e8207b5fd3b72d57f1e4ce6c7ac Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 3 May 2023 12:39:54 -0600 Subject: [PATCH 063/200] add endianness comment --- crypto/bls12381_utils.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index d57d31c5861..ccec6c78d17 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -329,11 +329,12 @@ BLST_ERROR Fr_star_read_bytes(Fr* a, const byte *bin, int len) { // write Fr element `a` in big endian bytes. 
void Fr_write_bytes(byte *bin, const Fr* a) { + // be_bytes_from_limbs works for both limb endiannesses be_bytes_from_limbs(bin, (limb_t*)a, Fr_BYTES); } // maps big-endian bytes into an Fr element using modular reduction -// Input is byte-big-endian, output is vec256 (also used as Fr) +// Input is byte-big-endian, output is Fr (internally vec256) static void Fr_from_be_bytes(Fr* out, const byte *bytes, size_t n) { Fr digit, radix; @@ -342,6 +343,7 @@ static void Fr_from_be_bytes(Fr* out, const byte *bytes, size_t n) byte* p = (byte*)bytes + n; while (n > Fr_BYTES) { + // limbs_from_be_bytes works for both limb endiannesses limbs_from_be_bytes((limb_t*)&digit, p -= Fr_BYTES, Fr_BYTES); // l_i Fr_mul_montg(&digit, &digit, &radix); // l_i * R^i (i is the loop number starting at 1) Fr_add(out, out, &digit); From 21c468693a6a7fcaf03910ef6cc729772ab8b1b5 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 8 May 2023 13:20:19 -0600 Subject: [PATCH 064/200] add blst Go package as an internal test package --- crypto/internal/blst/blst.go | 3346 +++++++++++++++++++++++++++++++ crypto/internal/blst/blst.h | 483 +++++ crypto/internal/blst/blst_aux.h | 111 + 3 files changed, 3940 insertions(+) create mode 100644 crypto/internal/blst/blst.go create mode 100644 crypto/internal/blst/blst.h create mode 100644 crypto/internal/blst/blst_aux.h diff --git a/crypto/internal/blst/blst.go b/crypto/internal/blst/blst.go new file mode 100644 index 00000000000..97b9047d1e3 --- /dev/null +++ b/crypto/internal/blst/blst.go @@ -0,0 +1,3346 @@ +/* + * Copied from https://github.com/supranational/blst + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +package blst + +// #cgo CFLAGS: -I${SRCDIR} -I${SRCDIR}/../../blst_src/build -I${SRCDIR}/../../blst_src -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset +// #cgo amd64 CFLAGS: -D__ADX__ -mno-avx +// #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ +// #include "blst.h" +// +// #if defined(__x86_64__) && (defined(__unix__) || defined(__APPLE__)) +// # include +// # include +// static void handler(int signum) +// { ssize_t n = write(2, "Caught SIGILL in blst_cgo_init, " +// "consult /bindings/go/README.md.\n", 70); +// _exit(128+SIGILL); +// (void)n; +// } +// __attribute__((constructor)) static void blst_cgo_init() +// { blst_fp temp = { 0 }; +// struct sigaction act = { handler }, oact; +// sigaction(SIGILL, &act, &oact); +// blst_fp_sqr(&temp, &temp); +// sigaction(SIGILL, &oact, NULL); +// } +// #endif +// +// static size_t go_pairing_sizeof(size_t DST_len) +// { return (blst_pairing_sizeof() + DST_len + sizeof(blst_pairing) - 1) / +// sizeof(blst_pairing); +// } +// static void go_pairing_init(blst_pairing *new_ctx, bool hash_or_encode, +// const byte *DST, size_t DST_len) +// { if (DST != NULL) { +// byte *dst = (byte*)new_ctx + blst_pairing_sizeof(); +// for(size_t i = 0; i < DST_len; i++) dst[i] = DST[i]; +// DST = dst; +// } +// blst_pairing_init(new_ctx, hash_or_encode, DST, DST_len); +// } +// static void go_pairing_as_fp12(blst_fp12 *pt, blst_pairing *ctx) +// { *pt = *blst_pairing_as_fp12(ctx); } +// +// static void go_p1slice_to_affine(blst_p1_affine dst[], +// const blst_p1 points[], size_t npoints) +// { const blst_p1 *ppoints[2] = { points, NULL }; +// blst_p1s_to_affine(dst, ppoints, npoints); +// } +// static void go_p1slice_add(blst_p1 *dst, const blst_p1_affine points[], +// size_t npoints) +// { const 
blst_p1_affine *ppoints[2] = { points, NULL }; +// blst_p1s_add(dst, ppoints, npoints); +// } +// static void go_p2slice_to_affine(blst_p2_affine dst[], +// const blst_p2 points[], size_t npoints) +// { const blst_p2 *ppoints[2] = { points, NULL }; +// blst_p2s_to_affine(dst, ppoints, npoints); +// } +// static void go_p2slice_add(blst_p2 *dst, const blst_p2_affine points[], +// size_t npoints) +// { const blst_p2_affine *ppoints[2] = { points, NULL }; +// blst_p2s_add(dst, ppoints, npoints); +// } +// +// static void go_p1_mult_n_acc(blst_p1 *acc, const blst_fp *x, bool affine, +// const byte *scalar, size_t nbits) +// { blst_p1 m[1]; +// const void *p = x; +// if (p == NULL) +// p = blst_p1_generator(); +// else if (affine) +// blst_p1_from_affine(m, p), p = m; +// blst_p1_mult(m, p, scalar, nbits); +// blst_p1_add_or_double(acc, acc, m); +// } +// static void go_p2_mult_n_acc(blst_p2 *acc, const blst_fp2 *x, bool affine, +// const byte *scalar, size_t nbits) +// { blst_p2 m[1]; +// const void *p = x; +// if (p == NULL) +// p = blst_p2_generator(); +// else if (affine) +// blst_p2_from_affine(m, p), p = m; +// blst_p2_mult(m, p, scalar, nbits); +// blst_p2_add_or_double(acc, acc, m); +// } +// +// static void go_p1_sub_assign(blst_p1 *a, const blst_fp *x, bool affine) +// { blst_p1 minus_b; +// if (affine) +// blst_p1_from_affine(&minus_b, (const blst_p1_affine*)x); +// else +// minus_b = *(const blst_p1*)x; +// blst_p1_cneg(&minus_b, 1); +// blst_p1_add_or_double(a, a, &minus_b); +// } +// +// static void go_p2_sub_assign(blst_p2 *a, const blst_fp2 *x, bool affine) +// { blst_p2 minus_b; +// if (affine) +// blst_p2_from_affine(&minus_b, (const blst_p2_affine*)x); +// else +// minus_b = *(const blst_p2*)x; +// blst_p2_cneg(&minus_b, 1); +// blst_p2_add_or_double(a, a, &minus_b); +// } +// +// static bool go_scalar_from_bendian(blst_scalar *ret, const byte *in) +// { blst_scalar_from_bendian(ret, in); +// return blst_sk_check(ret); +// } +// static bool go_hash_to_scalar(blst_scalar *ret, +// const byte *msg, size_t msg_len, +// const byte *DST, size_t DST_len) +// { byte elem[48]; +// blst_expand_message_xmd(elem, sizeof(elem), msg, msg_len, DST, DST_len); +// return blst_scalar_from_be_bytes(ret, elem, sizeof(elem)); +// } +import "C" +import ( + "fmt" + "math/bits" + "runtime" + "sync" + "sync/atomic" +) + +const BLST_SCALAR_BYTES = 256 / 8 +const BLST_FP_BYTES = 384 / 8 +const BLST_P1_COMPRESS_BYTES = BLST_FP_BYTES +const BLST_P1_SERIALIZE_BYTES = BLST_FP_BYTES * 2 +const BLST_P2_COMPRESS_BYTES = BLST_FP_BYTES * 2 +const BLST_P2_SERIALIZE_BYTES = BLST_FP_BYTES * 4 + +type Scalar = C.blst_scalar +type Fp = C.blst_fp +type Fp2 = C.blst_fp2 +type Fp6 = C.blst_fp6 +type Fp12 = C.blst_fp12 +type P1 = C.blst_p1 +type P2 = C.blst_p2 +type P1Affine = C.blst_p1_affine +type P2Affine = C.blst_p2_affine +type Message = []byte +type Pairing = []C.blst_pairing +type SecretKey = Scalar +type P1s []P1 +type P2s []P2 +type P1Affines []P1Affine +type P2Affines []P2Affine + +// +// Configuration +// + +var maxProcs = initMaxProcs() + +func initMaxProcs() int { + maxProcs := runtime.GOMAXPROCS(0) + var version float32 + _, err := fmt.Sscanf(runtime.Version(), "go%f", &version) + if err != nil || version < 1.14 { + // be cooperative and leave one processor for the application + maxProcs -= 1 + } + if maxProcs <= 0 { + maxProcs = 1 + } + return maxProcs +} + +func SetMaxProcs(max int) { + if max <= 0 { + max = 1 + } + maxProcs = max +} + +// Secret key +func (sk *SecretKey) Zeroize() { + var zero 
SecretKey + *sk = zero +} + +func KeyGen(ikm []byte, optional ...[]byte) *SecretKey { + var sk SecretKey + var info []byte + var infoP *C.byte + if len(optional) > 0 { + info = optional[0] + if len(info) > 0 { + infoP = (*C.byte)(&info[0]) + } + } + if len(ikm) < 32 { + return nil + } + C.blst_keygen(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), + infoP, C.size_t(len(info))) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... + runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func KeyGenV3(ikm []byte, optional ...[]byte) *SecretKey { + if len(ikm) < 32 { + return nil + } + var sk SecretKey + var info []byte + var infoP *C.byte + if len(optional) > 0 { + info = optional[0] + if len(info) > 0 { + infoP = (*C.byte)(&info[0]) + } + } + C.blst_keygen_v3(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), + infoP, C.size_t(len(info))) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... + runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func KeyGenV45(ikm []byte, salt []byte, optional ...[]byte) *SecretKey { + if len(ikm) < 32 { + return nil + } + var sk SecretKey + var info []byte + var infoP *C.byte + if len(optional) > 0 { + info = optional[0] + if len(info) > 0 { + infoP = (*C.byte)(&info[0]) + } + } + C.blst_keygen_v4_5(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), + (*C.byte)(&salt[0]), C.size_t(len(salt)), + infoP, C.size_t(len(info))) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... + runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func KeyGenV5(ikm []byte, salt []byte, optional ...[]byte) *SecretKey { + if len(ikm) < 32 { + return nil + } + var sk SecretKey + var info []byte + var infoP *C.byte + if len(optional) > 0 { + info = optional[0] + if len(info) > 0 { + infoP = (*C.byte)(&info[0]) + } + } + C.blst_keygen_v5(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), + (*C.byte)(&salt[0]), C.size_t(len(salt)), + infoP, C.size_t(len(info))) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... + runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func DeriveMasterEip2333(ikm []byte) *SecretKey { + if len(ikm) < 32 { + return nil + } + var sk SecretKey + C.blst_derive_master_eip2333(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm))) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... + runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func (master *SecretKey) DeriveChildEip2333(child_index uint32) *SecretKey { + var sk SecretKey + C.blst_derive_child_eip2333(&sk, master, C.uint(child_index)) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... 
+ runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +// Pairing +func PairingCtx(hash_or_encode bool, DST []byte) Pairing { + DST_len := C.size_t(len(DST)) + ctx := make([]C.blst_pairing, int(C.go_pairing_sizeof(DST_len))) + var uDST *C.byte + if DST_len > 0 { + uDST = (*C.byte)(&DST[0]) + } + C.go_pairing_init(&ctx[0], C.bool(hash_or_encode), uDST, DST_len) + return ctx +} + +func PairingCommit(ctx Pairing) { + C.blst_pairing_commit(&ctx[0]) +} + +func PairingMerge(ctx Pairing, ctx1 Pairing) int { + r := C.blst_pairing_merge(&ctx[0], &ctx1[0]) + return int(r) +} + +func PairingFinalVerify(ctx Pairing, optional ...*Fp12) bool { + var gtsig *Fp12 = nil + if len(optional) > 0 { + gtsig = optional[0] + } + return bool(C.blst_pairing_finalverify(&ctx[0], gtsig)) +} + +func PairingRawAggregate(ctx Pairing, q *P2Affine, p *P1Affine) { + C.blst_pairing_raw_aggregate(&ctx[0], q, p) +} + +func PairingAsFp12(ctx Pairing) *Fp12 { + var pt Fp12 + C.go_pairing_as_fp12(&pt, &ctx[0]) + return &pt +} + +func Fp12One() Fp12 { + return *C.blst_fp12_one() +} + +func Fp12FinalVerify(pt1 *Fp12, pt2 *Fp12) bool { + return bool(C.blst_fp12_finalverify(pt1, pt2)) +} + +func Fp12MillerLoop(q *P2Affine, p *P1Affine) *Fp12 { + var pt Fp12 + C.blst_miller_loop(&pt, q, p) + return &pt +} + +func (pt *Fp12) MulAssign(p *Fp12) { + C.blst_fp12_mul(pt, pt, p) +} + +func (pt *Fp12) FinalExp() { + C.blst_final_exp(pt, pt) +} + +func (pt *Fp12) InGroup() bool { + return bool(C.blst_fp12_in_group(pt)) +} + +func (pt *Fp12) ToBendian() []byte { + var out [BLST_FP_BYTES * 12]byte + C.blst_bendian_from_fp12((*C.byte)(&out[0]), pt) + return out[:] +} + +// +// MIN-PK +// + +// +// PublicKey +// + +func (pk *P1Affine) From(s *Scalar) *P1Affine { + C.blst_sk_to_pk2_in_g1(nil, pk, s) + return pk +} + +func (pk *P1Affine) KeyValidate() bool { + return !bool(C.blst_p1_affine_is_inf(pk)) && + bool(C.blst_p1_affine_in_g1(pk)) +} + +// sigInfcheck, check for infinity, is a way to avoid going +// into resource-consuming verification. Passing 'false' is +// always cryptographically safe, but application might want +// to guard against obviously bogus individual[!] signatures. +func (sig *P2Affine) SigValidate(sigInfcheck bool) bool { + return (sigInfcheck && !bool(C.blst_p2_affine_is_inf(sig))) || + bool(C.blst_p2_affine_in_g2(sig)) +} + +// +// Sign +// + +func (sig *P2Affine) Sign(sk *SecretKey, msg []byte, dst []byte, + optional ...interface{}) *P2Affine { + augSingle, aug, useHash, ok := parseOpts(optional...) + if !ok || len(aug) != 0 { + return nil + } + + var q *P2 + if useHash { + q = HashToG2(msg, dst, augSingle) + } else { + q = EncodeToG2(msg, dst, augSingle) + } + C.blst_sign_pk2_in_g1(nil, sig, q, sk) + return sig +} + +// +// Signature +// + +// Functions to return a signature and public key+augmentation tuple. +// This enables point decompression (if needed) to happen in parallel. +type sigGetterP2 func() *P2Affine +type pkGetterP1 func(i uint32, temp *P1Affine) (*P1Affine, []byte) + +// Single verify with decompressed pk +func (sig *P2Affine) Verify(sigGroupcheck bool, pk *P1Affine, pkValidate bool, + msg Message, dst []byte, + optional ...interface{}) bool { // useHash bool, aug []byte + + aug, _, useHash, ok := parseOpts(optional...) 
+ if !ok { + return false + } + return sig.AggregateVerify(sigGroupcheck, []*P1Affine{pk}, pkValidate, + []Message{msg}, dst, useHash, [][]byte{aug}) +} + +// Single verify with compressed pk +// Uses a dummy signature to get the correct type +func (dummy *P2Affine) VerifyCompressed(sig []byte, sigGroupcheck bool, + pk []byte, pkValidate bool, msg Message, dst []byte, + optional ...bool) bool { // useHash bool, usePksAsAugs bool + + return dummy.AggregateVerifyCompressed(sig, sigGroupcheck, + [][]byte{pk}, pkValidate, + []Message{msg}, dst, optional...) +} + +// Aggregate verify with uncompressed signature and public keys +// Note that checking message uniqueness, if required, is left to the user. +// Not all signature schemes require it and this keeps the binding minimal +// and fast. Refer to the Uniq function for one method method of performing +// this check. +func (sig *P2Affine) AggregateVerify(sigGroupcheck bool, + pks []*P1Affine, pksVerify bool, msgs []Message, dst []byte, + optional ...interface{}) bool { // useHash bool, augs [][]byte + + // sanity checks and argument parsing + n := len(pks) + if n == 0 || len(msgs) != n { + return false + } + _, augs, useHash, ok := parseOpts(optional...) + useAugs := len(augs) != 0 + if !ok || (useAugs && len(augs) != n) { + return false + } + + sigFn := func() *P2Affine { + return sig + } + + pkFn := func(i uint32, _ *P1Affine) (*P1Affine, []byte) { + if useAugs { + return pks[i], augs[i] + } else { + return pks[i], nil + } + } + + return coreAggregateVerifyPkInG1(sigFn, sigGroupcheck, pkFn, pksVerify, + msgs, dst, useHash) +} + +// Aggregate verify with compressed signature and public keys +// Uses a dummy signature to get the correct type +func (dummy *P2Affine) AggregateVerifyCompressed(sig []byte, sigGroupcheck bool, + pks [][]byte, pksVerify bool, msgs []Message, dst []byte, + optional ...bool) bool { // useHash bool, usePksAsAugs bool + + // sanity checks and argument parsing + if len(pks) != len(msgs) { + return false + } + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + usePksAsAugs := false + if len(optional) > 1 { + usePksAsAugs = optional[1] + } + + sigFn := func() *P2Affine { + sigP := new(P2Affine) + if sigP.Uncompress(sig) == nil { + return nil + } + return sigP + } + pkFn := func(i uint32, pk *P1Affine) (*P1Affine, []byte) { + bytes := pks[i] + if len(bytes) == BLST_P1_SERIALIZE_BYTES && (bytes[0]&0x80) == 0 { + // Not compressed + if pk.Deserialize(bytes) == nil { + return nil, nil + } + } else if len(bytes) == BLST_P1_COMPRESS_BYTES && (bytes[0]&0x80) != 0 { + if pk.Uncompress(bytes) == nil { + return nil, nil + } + } else { + return nil, nil + } + if usePksAsAugs { + return pk, bytes + } + return pk, nil + } + return coreAggregateVerifyPkInG1(sigFn, sigGroupcheck, pkFn, pksVerify, + msgs, dst, useHash) +} + +func coreAggregateVerifyPkInG1(sigFn sigGetterP2, sigGroupcheck bool, + pkFn pkGetterP1, pkValidate bool, msgs []Message, dst []byte, + optional ...bool) bool { // useHash + + n := len(msgs) + if n == 0 { + return false + } + + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + + numCores := runtime.GOMAXPROCS(0) + numThreads := maxProcs + if numThreads > numCores { + numThreads = numCores + } + if numThreads > n { + numThreads = n + } + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding pk,msg[,aug] tuple and + // repeat until n is exceeded. The resulting accumulations will be + // fed into the msgsCh channel. 
+ msgsCh := make(chan Pairing, numThreads) + valid := int32(1) + curItem := uint32(0) + mutex := sync.Mutex{} + + mutex.Lock() + for tid := 0; tid < numThreads; tid++ { + go func() { + pairing := PairingCtx(useHash, dst) + var temp P1Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } else if work == 0 && maxProcs == numCores-1 && + numThreads == maxProcs { + // Avoid consuming all cores by waiting until the + // main thread has completed its miller loop before + // proceeding. + mutex.Lock() + mutex.Unlock() + } + + // Pull Public Key and augmentation blob + curPk, aug := pkFn(work, &temp) + if curPk == nil { + atomic.StoreInt32(&valid, 0) + break + } + + // Pairing and accumulate + ret := PairingAggregatePkInG1(pairing, curPk, pkValidate, + nil, false, msgs[work], aug) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + break + } + + // application might have some async work to do + runtime.Gosched() + } + if atomic.LoadInt32(&valid) > 0 { + PairingCommit(pairing) + msgsCh <- pairing + } else { + msgsCh <- nil + } + }() + } + + // Uncompress and check signature + var gtsig Fp12 + sig := sigFn() + if sig == nil { + atomic.StoreInt32(&valid, 0) + } + if atomic.LoadInt32(&valid) > 0 && sigGroupcheck && + !sig.SigValidate(false) { + atomic.StoreInt32(&valid, 0) + } + if atomic.LoadInt32(&valid) > 0 { + C.blst_aggregated_in_g2(>sig, sig) + } + mutex.Unlock() + + // Accumulate the thread results + var pairings Pairing + for i := 0; i < numThreads; i++ { + msg := <-msgsCh + if msg != nil { + if pairings == nil { + pairings = msg + } else { + ret := PairingMerge(pairings, msg) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + } + } + } + } + if atomic.LoadInt32(&valid) == 0 || pairings == nil { + return false + } + + return PairingFinalVerify(pairings, >sig) +} + +func CoreVerifyPkInG1(pk *P1Affine, sig *P2Affine, hash_or_encode bool, + msg Message, dst []byte, optional ...[]byte) int { + + var aug []byte + var uaug *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + uaug = (*C.byte)(&aug[0]) + } + } + + if runtime.NumGoroutine() < maxProcs { + sigFn := func() *P2Affine { + return sig + } + pkFn := func(_ uint32, _ *P1Affine) (*P1Affine, []byte) { + return pk, aug + } + if !coreAggregateVerifyPkInG1(sigFn, true, pkFn, true, []Message{msg}, + dst, hash_or_encode) { + return C.BLST_VERIFY_FAIL + } + return C.BLST_SUCCESS + } + + var udst *C.byte + if len(dst) > 0 { + udst = (*C.byte)(&dst[0]) + } + var umsg *C.byte + if len(msg) > 0 { + umsg = (*C.byte)(&msg[0]) + } + + return int(C.blst_core_verify_pk_in_g1(pk, sig, C.bool(hash_or_encode), + umsg, C.size_t(len(msg)), + udst, C.size_t(len(dst)), + uaug, C.size_t(len(aug)))) +} + +// pks are assumed to be verified for proof of possession, +// which implies that they are already group-checked +func (sig *P2Affine) FastAggregateVerify(sigGroupcheck bool, + pks []*P1Affine, msg Message, dst []byte, + optional ...interface{}) bool { // pass-through to Verify + n := len(pks) + + // TODO: return value for length zero? + if n == 0 { + return false + } + + aggregator := new(P1Aggregate) + if !aggregator.Aggregate(pks, false) { + return false + } + pkAff := aggregator.ToAffine() + + // Verify + return sig.Verify(sigGroupcheck, pkAff, false, msg, dst, optional...) 
+} + +func (dummy *P2Affine) MultipleAggregateVerify(sigs []*P2Affine, + sigsGroupcheck bool, pks []*P1Affine, pksVerify bool, + msgs []Message, dst []byte, randFn func(*Scalar), randBits int, + optional ...interface{}) bool { // useHash + + // Sanity checks and argument parsing + n := len(pks) + if n == 0 || len(msgs) != n || len(sigs) != n { + return false + } + _, augs, useHash, ok := parseOpts(optional...) + useAugs := len(augs) != 0 + if !ok || (useAugs && len(augs) != n) { + return false + } + + paramsFn := + func(work uint32, sig *P2Affine, pk *P1Affine, rand *Scalar) ( + *P2Affine, *P1Affine, *Scalar, []byte) { + randFn(rand) + var aug []byte + if useAugs { + aug = augs[work] + } + return sigs[work], pks[work], rand, aug + } + + return multipleAggregateVerifyPkInG1(paramsFn, sigsGroupcheck, pksVerify, + msgs, dst, randBits, useHash) +} + +type mulAggGetterPkInG1 func(work uint32, sig *P2Affine, pk *P1Affine, + rand *Scalar) (*P2Affine, *P1Affine, *Scalar, []byte) + +func multipleAggregateVerifyPkInG1(paramsFn mulAggGetterPkInG1, + sigsGroupcheck bool, pksVerify bool, msgs []Message, + dst []byte, randBits int, + optional ...bool) bool { // useHash + n := len(msgs) + if n == 0 { + return false + } + + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + + numCores := runtime.GOMAXPROCS(0) + numThreads := maxProcs + if numThreads > numCores { + numThreads = numCores + } + if numThreads > n { + numThreads = n + } + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding pk,msg[,aug] tuple and + // repeat until n is exceeded. The resulting accumulations will be + // fed into the msgsCh channel. + msgsCh := make(chan Pairing, numThreads) + valid := int32(1) + curItem := uint32(0) + + for tid := 0; tid < numThreads; tid++ { + go func() { + pairing := PairingCtx(useHash, dst) + var tempRand Scalar + var tempPk P1Affine + var tempSig P2Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + + curSig, curPk, curRand, aug := paramsFn(work, &tempSig, + &tempPk, &tempRand) + + if PairingMulNAggregatePkInG1(pairing, curPk, pksVerify, + curSig, sigsGroupcheck, curRand, + randBits, msgs[work], aug) != + C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + break + } + + // application might have some async work to do + runtime.Gosched() + } + if atomic.LoadInt32(&valid) > 0 { + PairingCommit(pairing) + msgsCh <- pairing + } else { + msgsCh <- nil + } + }() + } + + // Accumulate the thread results + var pairings Pairing + for i := 0; i < numThreads; i++ { + msg := <-msgsCh + if msg != nil { + if pairings == nil { + pairings = msg + } else { + ret := PairingMerge(pairings, msg) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + } + } + } + } + if atomic.LoadInt32(&valid) == 0 || pairings == nil { + return false + } + + return PairingFinalVerify(pairings, nil) +} + +// +// Aggregate P2 +// + +type aggGetterP2 func(i uint32, temp *P2Affine) *P2Affine +type P2Aggregate struct { + v *P2 +} + +// Aggregate uncompressed elements +func (agg *P2Aggregate) Aggregate(elmts []*P2Affine, + groupcheck bool) bool { + if len(elmts) == 0 { + return true + } + getter := func(i uint32, _ *P2Affine) *P2Affine { return elmts[i] } + return agg.aggregate(getter, groupcheck, len(elmts)) +} + +// Aggregate compressed elements +func (agg *P2Aggregate) AggregateCompressed(elmts [][]byte, + groupcheck bool) bool { + if len(elmts) == 0 { + return true 
+ } + getter := func(i uint32, p *P2Affine) *P2Affine { + bytes := elmts[i] + if p.Uncompress(bytes) == nil { + return nil + } + return p + } + return agg.aggregate(getter, groupcheck, len(elmts)) +} + +func (agg *P2Aggregate) AddAggregate(other *P2Aggregate) { + if other.v == nil { + // do nothing + } else if agg.v == nil { + agg.v = other.v + } else { + C.blst_p2_add_or_double(agg.v, agg.v, other.v) + } +} + +func (agg *P2Aggregate) Add(elmt *P2Affine, groupcheck bool) bool { + if groupcheck && !bool(C.blst_p2_affine_in_g2(elmt)) { + return false + } + if agg.v == nil { + agg.v = new(P2) + C.blst_p2_from_affine(agg.v, elmt) + } else { + C.blst_p2_add_or_double_affine(agg.v, agg.v, elmt) + } + return true +} + +func (agg *P2Aggregate) ToAffine() *P2Affine { + if agg.v == nil { + return new(P2Affine) + } + return agg.v.ToAffine() +} + +func (agg *P2Aggregate) aggregate(getter aggGetterP2, groupcheck bool, + n int) bool { + + if n == 0 { + return true + } + // operations are considered short enough for not to care about + // keeping one core free... + numThreads := runtime.GOMAXPROCS(0) + if numThreads > n { + numThreads = n + } + + valid := int32(1) + type result struct { + agg *P2 + empty bool + } + msgs := make(chan result, numThreads) + curItem := uint32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + first := true + var agg P2 + var temp P2Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + + // Signature validate + curElmt := getter(work, &temp) + if curElmt == nil { + atomic.StoreInt32(&valid, 0) + break + } + if groupcheck && !bool(C.blst_p2_affine_in_g2(curElmt)) { + atomic.StoreInt32(&valid, 0) + break + } + if first { + C.blst_p2_from_affine(&agg, curElmt) + first = false + } else { + C.blst_p2_add_or_double_affine(&agg, &agg, curElmt) + } + // application might have some async work to do + runtime.Gosched() + } + if first { + msgs <- result{nil, true} + } else if atomic.LoadInt32(&valid) > 0 { + msgs <- result{&agg, false} + } else { + msgs <- result{nil, false} + } + }() + } + + // Accumulate the thread results + first := agg.v == nil + validLocal := true + for i := 0; i < numThreads; i++ { + msg := <-msgs + if !validLocal || msg.empty { + // do nothing + } else if msg.agg == nil { + validLocal = false + // This should be unnecessary but seems safer + atomic.StoreInt32(&valid, 0) + } else { + if first { + agg.v = msg.agg + first = false + } else { + C.blst_p2_add_or_double(agg.v, agg.v, msg.agg) + } + } + } + if atomic.LoadInt32(&valid) == 0 { + agg.v = nil + return false + } + return true +} + +// +// MIN-SIG +// + +// +// PublicKey +// + +func (pk *P2Affine) From(s *Scalar) *P2Affine { + C.blst_sk_to_pk2_in_g2(nil, pk, s) + return pk +} + +func (pk *P2Affine) KeyValidate() bool { + return !bool(C.blst_p2_affine_is_inf(pk)) && + bool(C.blst_p2_affine_in_g2(pk)) +} + +// sigInfcheck, check for infinity, is a way to avoid going +// into resource-consuming verification. Passing 'false' is +// always cryptographically safe, but application might want +// to guard against obviously bogus individual[!] signatures. +func (sig *P1Affine) SigValidate(sigInfcheck bool) bool { + return (sigInfcheck && !bool(C.blst_p1_affine_is_inf(sig))) || + bool(C.blst_p1_affine_in_g1(sig)) +} + +// +// Sign +// + +func (sig *P1Affine) Sign(sk *SecretKey, msg []byte, dst []byte, + optional ...interface{}) *P1Affine { + augSingle, aug, useHash, ok := parseOpts(optional...) 
+ if !ok || len(aug) != 0 { + return nil + } + + var q *P1 + if useHash { + q = HashToG1(msg, dst, augSingle) + } else { + q = EncodeToG1(msg, dst, augSingle) + } + C.blst_sign_pk2_in_g2(nil, sig, q, sk) + return sig +} + +// +// Signature +// + +// Functions to return a signature and public key+augmentation tuple. +// This enables point decompression (if needed) to happen in parallel. +type sigGetterP1 func() *P1Affine +type pkGetterP2 func(i uint32, temp *P2Affine) (*P2Affine, []byte) + +// Single verify with decompressed pk +func (sig *P1Affine) Verify(sigGroupcheck bool, pk *P2Affine, pkValidate bool, + msg Message, dst []byte, + optional ...interface{}) bool { // useHash bool, aug []byte + + aug, _, useHash, ok := parseOpts(optional...) + if !ok { + return false + } + return sig.AggregateVerify(sigGroupcheck, []*P2Affine{pk}, pkValidate, + []Message{msg}, dst, useHash, [][]byte{aug}) +} + +// Single verify with compressed pk +// Uses a dummy signature to get the correct type +func (dummy *P1Affine) VerifyCompressed(sig []byte, sigGroupcheck bool, + pk []byte, pkValidate bool, msg Message, dst []byte, + optional ...bool) bool { // useHash bool, usePksAsAugs bool + + return dummy.AggregateVerifyCompressed(sig, sigGroupcheck, + [][]byte{pk}, pkValidate, + []Message{msg}, dst, optional...) +} + +// Aggregate verify with uncompressed signature and public keys +// Note that checking message uniqueness, if required, is left to the user. +// Not all signature schemes require it and this keeps the binding minimal +// and fast. Refer to the Uniq function for one method method of performing +// this check. +func (sig *P1Affine) AggregateVerify(sigGroupcheck bool, + pks []*P2Affine, pksVerify bool, msgs []Message, dst []byte, + optional ...interface{}) bool { // useHash bool, augs [][]byte + + // sanity checks and argument parsing + n := len(pks) + if n == 0 || len(msgs) != n { + return false + } + _, augs, useHash, ok := parseOpts(optional...) 
+ useAugs := len(augs) != 0 + if !ok || (useAugs && len(augs) != n) { + return false + } + + sigFn := func() *P1Affine { + return sig + } + + pkFn := func(i uint32, _ *P2Affine) (*P2Affine, []byte) { + if useAugs { + return pks[i], augs[i] + } else { + return pks[i], nil + } + } + + return coreAggregateVerifyPkInG2(sigFn, sigGroupcheck, pkFn, pksVerify, + msgs, dst, useHash) +} + +// Aggregate verify with compressed signature and public keys +// Uses a dummy signature to get the correct type +func (dummy *P1Affine) AggregateVerifyCompressed(sig []byte, sigGroupcheck bool, + pks [][]byte, pksVerify bool, msgs []Message, dst []byte, + optional ...bool) bool { // useHash bool, usePksAsAugs bool + + // sanity checks and argument parsing + if len(pks) != len(msgs) { + return false + } + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + usePksAsAugs := false + if len(optional) > 1 { + usePksAsAugs = optional[1] + } + + sigFn := func() *P1Affine { + sigP := new(P1Affine) + if sigP.Uncompress(sig) == nil { + return nil + } + return sigP + } + pkFn := func(i uint32, pk *P2Affine) (*P2Affine, []byte) { + bytes := pks[i] + if len(bytes) == BLST_P2_SERIALIZE_BYTES && (bytes[0]&0x80) == 0 { + // Not compressed + if pk.Deserialize(bytes) == nil { + return nil, nil + } + } else if len(bytes) == BLST_P2_COMPRESS_BYTES && (bytes[0]&0x80) != 0 { + if pk.Uncompress(bytes) == nil { + return nil, nil + } + } else { + return nil, nil + } + if usePksAsAugs { + return pk, bytes + } + return pk, nil + } + return coreAggregateVerifyPkInG2(sigFn, sigGroupcheck, pkFn, pksVerify, + msgs, dst, useHash) +} + +func coreAggregateVerifyPkInG2(sigFn sigGetterP1, sigGroupcheck bool, + pkFn pkGetterP2, pkValidate bool, msgs []Message, dst []byte, + optional ...bool) bool { // useHash + + n := len(msgs) + if n == 0 { + return false + } + + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + + numCores := runtime.GOMAXPROCS(0) + numThreads := maxProcs + if numThreads > numCores { + numThreads = numCores + } + if numThreads > n { + numThreads = n + } + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding pk,msg[,aug] tuple and + // repeat until n is exceeded. The resulting accumulations will be + // fed into the msgsCh channel. + msgsCh := make(chan Pairing, numThreads) + valid := int32(1) + curItem := uint32(0) + mutex := sync.Mutex{} + + mutex.Lock() + for tid := 0; tid < numThreads; tid++ { + go func() { + pairing := PairingCtx(useHash, dst) + var temp P2Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } else if work == 0 && maxProcs == numCores-1 && + numThreads == maxProcs { + // Avoid consuming all cores by waiting until the + // main thread has completed its miller loop before + // proceeding. 
+ mutex.Lock() + mutex.Unlock() + } + + // Pull Public Key and augmentation blob + curPk, aug := pkFn(work, &temp) + if curPk == nil { + atomic.StoreInt32(&valid, 0) + break + } + + // Pairing and accumulate + ret := PairingAggregatePkInG2(pairing, curPk, pkValidate, + nil, false, msgs[work], aug) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + break + } + + // application might have some async work to do + runtime.Gosched() + } + if atomic.LoadInt32(&valid) > 0 { + PairingCommit(pairing) + msgsCh <- pairing + } else { + msgsCh <- nil + } + }() + } + + // Uncompress and check signature + var gtsig Fp12 + sig := sigFn() + if sig == nil { + atomic.StoreInt32(&valid, 0) + } + if atomic.LoadInt32(&valid) > 0 && sigGroupcheck && + !sig.SigValidate(false) { + atomic.StoreInt32(&valid, 0) + } + if atomic.LoadInt32(&valid) > 0 { + C.blst_aggregated_in_g1(&gtsig, sig) + } + mutex.Unlock() + + // Accumulate the thread results + var pairings Pairing + for i := 0; i < numThreads; i++ { + msg := <-msgsCh + if msg != nil { + if pairings == nil { + pairings = msg + } else { + ret := PairingMerge(pairings, msg) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + } + } + } + } + if atomic.LoadInt32(&valid) == 0 || pairings == nil { + return false + } + + return PairingFinalVerify(pairings, &gtsig) +} + +func CoreVerifyPkInG2(pk *P2Affine, sig *P1Affine, hash_or_encode bool, + msg Message, dst []byte, optional ...[]byte) int { + + var aug []byte + var uaug *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + uaug = (*C.byte)(&aug[0]) + } + } + + if runtime.NumGoroutine() < maxProcs { + sigFn := func() *P1Affine { + return sig + } + pkFn := func(_ uint32, _ *P2Affine) (*P2Affine, []byte) { + return pk, aug + } + if !coreAggregateVerifyPkInG2(sigFn, true, pkFn, true, []Message{msg}, + dst, hash_or_encode) { + return C.BLST_VERIFY_FAIL + } + return C.BLST_SUCCESS + } + + var udst *C.byte + if len(dst) > 0 { + udst = (*C.byte)(&dst[0]) + } + var umsg *C.byte + if len(msg) > 0 { + umsg = (*C.byte)(&msg[0]) + } + + return int(C.blst_core_verify_pk_in_g2(pk, sig, C.bool(hash_or_encode), + umsg, C.size_t(len(msg)), + udst, C.size_t(len(dst)), + uaug, C.size_t(len(aug)))) +} + +// pks are assumed to be verified for proof of possession, +// which implies that they are already group-checked +func (sig *P1Affine) FastAggregateVerify(sigGroupcheck bool, + pks []*P2Affine, msg Message, dst []byte, + optional ...interface{}) bool { // pass-through to Verify + n := len(pks) + + // TODO: return value for length zero? + if n == 0 { + return false + } + + aggregator := new(P2Aggregate) + if !aggregator.Aggregate(pks, false) { + return false + } + pkAff := aggregator.ToAffine() + + // Verify + return sig.Verify(sigGroupcheck, pkAff, false, msg, dst, optional...) +} + +func (dummy *P1Affine) MultipleAggregateVerify(sigs []*P1Affine, + sigsGroupcheck bool, pks []*P2Affine, pksVerify bool, + msgs []Message, dst []byte, randFn func(*Scalar), randBits int, + optional ...interface{}) bool { // useHash + + // Sanity checks and argument parsing + n := len(pks) + if n == 0 || len(msgs) != n || len(sigs) != n { + return false + } + _, augs, useHash, ok := parseOpts(optional...)
+ useAugs := len(augs) != 0 + if !ok || (useAugs && len(augs) != n) { + return false + } + + paramsFn := + func(work uint32, sig *P1Affine, pk *P2Affine, rand *Scalar) ( + *P1Affine, *P2Affine, *Scalar, []byte) { + randFn(rand) + var aug []byte + if useAugs { + aug = augs[work] + } + return sigs[work], pks[work], rand, aug + } + + return multipleAggregateVerifyPkInG2(paramsFn, sigsGroupcheck, pksVerify, + msgs, dst, randBits, useHash) +} + +type mulAggGetterPkInG2 func(work uint32, sig *P1Affine, pk *P2Affine, + rand *Scalar) (*P1Affine, *P2Affine, *Scalar, []byte) + +func multipleAggregateVerifyPkInG2(paramsFn mulAggGetterPkInG2, + sigsGroupcheck bool, pksVerify bool, msgs []Message, + dst []byte, randBits int, + optional ...bool) bool { // useHash + n := len(msgs) + if n == 0 { + return false + } + + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + + numCores := runtime.GOMAXPROCS(0) + numThreads := maxProcs + if numThreads > numCores { + numThreads = numCores + } + if numThreads > n { + numThreads = n + } + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding pk,msg[,aug] tuple and + // repeat until n is exceeded. The resulting accumulations will be + // fed into the msgsCh channel. + msgsCh := make(chan Pairing, numThreads) + valid := int32(1) + curItem := uint32(0) + + for tid := 0; tid < numThreads; tid++ { + go func() { + pairing := PairingCtx(useHash, dst) + var tempRand Scalar + var tempPk P2Affine + var tempSig P1Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + + curSig, curPk, curRand, aug := paramsFn(work, &tempSig, + &tempPk, &tempRand) + + if PairingMulNAggregatePkInG2(pairing, curPk, pksVerify, + curSig, sigsGroupcheck, curRand, + randBits, msgs[work], aug) != + C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + break + } + + // application might have some async work to do + runtime.Gosched() + } + if atomic.LoadInt32(&valid) > 0 { + PairingCommit(pairing) + msgsCh <- pairing + } else { + msgsCh <- nil + } + }() + } + + // Accumulate the thread results + var pairings Pairing + for i := 0; i < numThreads; i++ { + msg := <-msgsCh + if msg != nil { + if pairings == nil { + pairings = msg + } else { + ret := PairingMerge(pairings, msg) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + } + } + } + } + if atomic.LoadInt32(&valid) == 0 || pairings == nil { + return false + } + + return PairingFinalVerify(pairings, nil) +} + +// +// Aggregate P1 +// + +type aggGetterP1 func(i uint32, temp *P1Affine) *P1Affine +type P1Aggregate struct { + v *P1 +} + +// Aggregate uncompressed elements +func (agg *P1Aggregate) Aggregate(elmts []*P1Affine, + groupcheck bool) bool { + if len(elmts) == 0 { + return true + } + getter := func(i uint32, _ *P1Affine) *P1Affine { return elmts[i] } + return agg.aggregate(getter, groupcheck, len(elmts)) +} + +// Aggregate compressed elements +func (agg *P1Aggregate) AggregateCompressed(elmts [][]byte, + groupcheck bool) bool { + if len(elmts) == 0 { + return true + } + getter := func(i uint32, p *P1Affine) *P1Affine { + bytes := elmts[i] + if p.Uncompress(bytes) == nil { + return nil + } + return p + } + return agg.aggregate(getter, groupcheck, len(elmts)) +} + +func (agg *P1Aggregate) AddAggregate(other *P1Aggregate) { + if other.v == nil { + // do nothing + } else if agg.v == nil { + agg.v = other.v + } else { + C.blst_p1_add_or_double(agg.v, agg.v, other.v) + } +} + 
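+// Illustrative use of the aggregation and verification entry points defined
+// in this file (a sketch only; sigs, pks, msg, msgs and dst are
+// caller-supplied placeholders, not identifiers defined here):
+//
+//	agg := new(P1Aggregate)
+//	if agg.Aggregate(sigs, true) {          // group-checked aggregation
+//		aggSig := agg.ToAffine()
+//		_ = aggSig.AggregateVerify(true, pks, true, msgs, dst) // distinct messages
+//		_ = aggSig.FastAggregateVerify(true, pks, msg, dst)    // PoP keys, common message
+//	}
+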
+func (agg *P1Aggregate) Add(elmt *P1Affine, groupcheck bool) bool { + if groupcheck && !bool(C.blst_p1_affine_in_g1(elmt)) { + return false + } + if agg.v == nil { + agg.v = new(P1) + C.blst_p1_from_affine(agg.v, elmt) + } else { + C.blst_p1_add_or_double_affine(agg.v, agg.v, elmt) + } + return true +} + +func (agg *P1Aggregate) ToAffine() *P1Affine { + if agg.v == nil { + return new(P1Affine) + } + return agg.v.ToAffine() +} + +func (agg *P1Aggregate) aggregate(getter aggGetterP1, groupcheck bool, + n int) bool { + + if n == 0 { + return true + } + // operations are considered short enough for not to care about + // keeping one core free... + numThreads := runtime.GOMAXPROCS(0) + if numThreads > n { + numThreads = n + } + + valid := int32(1) + type result struct { + agg *P1 + empty bool + } + msgs := make(chan result, numThreads) + curItem := uint32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + first := true + var agg P1 + var temp P1Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + + // Signature validate + curElmt := getter(work, &temp) + if curElmt == nil { + atomic.StoreInt32(&valid, 0) + break + } + if groupcheck && !bool(C.blst_p1_affine_in_g1(curElmt)) { + atomic.StoreInt32(&valid, 0) + break + } + if first { + C.blst_p1_from_affine(&agg, curElmt) + first = false + } else { + C.blst_p1_add_or_double_affine(&agg, &agg, curElmt) + } + // application might have some async work to do + runtime.Gosched() + } + if first { + msgs <- result{nil, true} + } else if atomic.LoadInt32(&valid) > 0 { + msgs <- result{&agg, false} + } else { + msgs <- result{nil, false} + } + }() + } + + // Accumulate the thread results + first := agg.v == nil + validLocal := true + for i := 0; i < numThreads; i++ { + msg := <-msgs + if !validLocal || msg.empty { + // do nothing + } else if msg.agg == nil { + validLocal = false + // This should be unnecessary but seems safer + atomic.StoreInt32(&valid, 0) + } else { + if first { + agg.v = msg.agg + first = false + } else { + C.blst_p1_add_or_double(agg.v, agg.v, msg.agg) + } + } + } + if atomic.LoadInt32(&valid) == 0 { + agg.v = nil + return false + } + return true +} +func PairingAggregatePkInG1(ctx Pairing, PK *P1Affine, pkValidate bool, + sig *P2Affine, sigGroupcheck bool, msg []byte, + optional ...[]byte) int { // aug + var aug []byte + var uaug *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + uaug = (*C.byte)(&aug[0]) + } + } + var umsg *C.byte + if len(msg) > 0 { + umsg = (*C.byte)(&msg[0]) + } + + r := C.blst_pairing_chk_n_aggr_pk_in_g1(&ctx[0], + PK, C.bool(pkValidate), + sig, C.bool(sigGroupcheck), + umsg, C.size_t(len(msg)), + uaug, C.size_t(len(aug))) + + return int(r) +} + +func PairingMulNAggregatePkInG1(ctx Pairing, PK *P1Affine, pkValidate bool, + sig *P2Affine, sigGroupcheck bool, + rand *Scalar, randBits int, msg []byte, + optional ...[]byte) int { // aug + var aug []byte + var uaug *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + uaug = (*C.byte)(&aug[0]) + } + } + var umsg *C.byte + if len(msg) > 0 { + umsg = (*C.byte)(&msg[0]) + } + + r := C.blst_pairing_chk_n_mul_n_aggr_pk_in_g1(&ctx[0], + PK, C.bool(pkValidate), + sig, C.bool(sigGroupcheck), + &rand.b[0], C.size_t(randBits), + umsg, C.size_t(len(msg)), + uaug, C.size_t(len(aug))) + + return int(r) +} + +// +// Serialization/Deserialization. 
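+// Points use the Zcash encoding: 96 uncompressed / 48 compressed bytes for
+// G1 (twice that for G2), with the top bit of the first byte flagging a
+// compressed encoding, as checked in AggregateVerifyCompressed above.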
+// + +// P1 Serdes +func (p1 *P1Affine) Serialize() []byte { + var out [BLST_P1_SERIALIZE_BYTES]byte + C.blst_p1_affine_serialize((*C.byte)(&out[0]), p1) + return out[:] +} + +func (p1 *P1Affine) Deserialize(in []byte) *P1Affine { + if len(in) != BLST_P1_SERIALIZE_BYTES { + return nil + } + if C.blst_p1_deserialize(p1, (*C.byte)(&in[0])) != C.BLST_SUCCESS { + return nil + } + return p1 +} +func (p1 *P1Affine) Compress() []byte { + var out [BLST_P1_COMPRESS_BYTES]byte + C.blst_p1_affine_compress((*C.byte)(&out[0]), p1) + return out[:] +} + +func (p1 *P1Affine) Uncompress(in []byte) *P1Affine { + if len(in) != BLST_P1_COMPRESS_BYTES { + return nil + } + if C.blst_p1_uncompress(p1, (*C.byte)(&in[0])) != C.BLST_SUCCESS { + return nil + } + return p1 +} + +func (p1 *P1Affine) InG1() bool { + return bool(C.blst_p1_affine_in_g1(p1)) +} + +func (dummy *P1Affine) BatchUncompress(in [][]byte) []*P1Affine { + // Allocate space for all of the resulting points. Later we'll save pointers + // and return those so that the result could be used in other functions, + // such as MultipleAggregateVerify. + n := len(in) + points := make([]P1Affine, n) + pointsPtrs := make([]*P1Affine, n) + + numCores := runtime.GOMAXPROCS(0) + numThreads := maxProcs + if numThreads > numCores { + numThreads = numCores + } + if numThreads > n { + numThreads = n + } + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding point, and + // repeat until n is exceeded. Each thread will send a result (true for + // success, false for failure) into the channel when complete. + resCh := make(chan bool, numThreads) + valid := int32(1) + curItem := uint32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + if points[work].Uncompress(in[work]) == nil { + atomic.StoreInt32(&valid, 0) + break + } + pointsPtrs[work] = &points[work] + } + if atomic.LoadInt32(&valid) > 0 { + resCh <- true + } else { + resCh <- false + } + }() + } + + // Collect the threads + result := true + for i := 0; i < numThreads; i++ { + if !<-resCh { + result = false + } + } + if atomic.LoadInt32(&valid) == 0 || !result { + return nil + } + return pointsPtrs +} + +func (p1 *P1) Serialize() []byte { + var out [BLST_P1_SERIALIZE_BYTES]byte + C.blst_p1_serialize((*C.byte)(&out[0]), p1) + return out[:] +} +func (p1 *P1) Compress() []byte { + var out [BLST_P1_COMPRESS_BYTES]byte + C.blst_p1_compress((*C.byte)(&out[0]), p1) + return out[:] +} + +func (p1 *P1) MultAssign(scalarIf interface{}, optional ...int) *P1 { + var nbits int + var scalar *C.byte + switch val := scalarIf.(type) { + case []byte: + scalar = (*C.byte)(&val[0]) + nbits = len(val) * 8 + case *Scalar: + scalar = &val.b[0] + nbits = 255 + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + if len(optional) > 0 { + nbits = optional[0] + } + C.blst_p1_mult(p1, p1, scalar, C.size_t(nbits)) + return p1 +} + +func (p1 *P1) Mult(scalarIf interface{}, optional ...int) *P1 { + ret := *p1 + return ret.MultAssign(scalarIf, optional...) 
+} + +func (p1 *P1) AddAssign(pointIf interface{}) *P1 { + switch val := pointIf.(type) { + case *P1: + C.blst_p1_add_or_double(p1, p1, val) + case *P1Affine: + C.blst_p1_add_or_double_affine(p1, p1, val) + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + return p1 +} + +func (p1 *P1) Add(pointIf interface{}) *P1 { + ret := *p1 + return ret.AddAssign(pointIf) +} + +func (p1 *P1) SubAssign(pointIf interface{}) *P1 { + var x *Fp + var affine C.bool + switch val := pointIf.(type) { + case *P1: + x = &val.x + affine = false + case *P1Affine: + x = &val.x + affine = true + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + C.go_p1_sub_assign(p1, x, affine) + return p1 +} + +func (p1 *P1) Sub(pointIf interface{}) *P1 { + ret := *p1 + return ret.SubAssign(pointIf) +} + +func P1Generator() *P1 { + return C.blst_p1_generator() +} + +// 'acc += point * scalar', passing 'nil' for 'point' means "use the +// +// group generator point" +func (acc *P1) MultNAccumulate(pointIf interface{}, scalarIf interface{}, + optional ...int) *P1 { + var x *Fp + var affine C.bool + if pointIf != nil { + switch val := pointIf.(type) { + case *P1: + x = &val.x + affine = false + case *P1Affine: + x = &val.x + affine = true + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + } + var nbits int + var scalar *C.byte + switch val := scalarIf.(type) { + case []byte: + scalar = (*C.byte)(&val[0]) + nbits = len(val) * 8 + case *Scalar: + scalar = &val.b[0] + nbits = 255 + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + if len(optional) > 0 { + nbits = optional[0] + } + C.go_p1_mult_n_acc(acc, x, affine, scalar, C.size_t(nbits)) + return acc +} + +// +// Affine +// + +func (p *P1) ToAffine() *P1Affine { + var pa P1Affine + C.blst_p1_to_affine(&pa, p) + return &pa +} + +func (p *P1) FromAffine(pa *P1Affine) { + C.blst_p1_from_affine(p, pa) +} + +// Hash +func HashToG1(msg []byte, dst []byte, + optional ...[]byte) *P1 { // aug + var q P1 + + // Handle zero length message + var msgC *C.byte + if len(msg) > 0 { + msgC = (*C.byte)(&msg[0]) + } + + var dstC *C.byte + if len(dst) > 0 { + dstC = (*C.byte)(&dst[0]) + } + + var aug []byte + var augC *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + augC = (*C.byte)(&aug[0]) + } + } + + C.blst_hash_to_g1(&q, msgC, C.size_t(len(msg)), + dstC, C.size_t(len(dst)), + augC, C.size_t(len(aug))) + return &q +} + +func EncodeToG1(msg []byte, dst []byte, + optional ...[]byte) *P1 { // aug + var q P1 + + // Handle zero length message + var msgC *C.byte + if len(msg) > 0 { + msgC = (*C.byte)(&msg[0]) + } + + var dstC *C.byte + if len(dst) > 0 { + dstC = (*C.byte)(&dst[0]) + } + + var aug []byte + var augC *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + augC = (*C.byte)(&aug[0]) + } + } + + C.blst_encode_to_g1(&q, msgC, C.size_t(len(msg)), + dstC, C.size_t(len(dst)), + augC, C.size_t(len(aug))) + return &q +} + +// +// Multi-point/scalar operations +// + +func P1sToAffine(points []*P1, optional ...int) P1Affines { + var npoints int + if len(optional) > 0 { + npoints = optional[0] + } else { + npoints = len(points) + } + ret := make([]P1Affine, npoints) + _cgoCheckPointer := func(...interface{}) {} + C.blst_p1s_to_affine(&ret[0], &points[0], C.size_t(npoints)) + return ret +} + +func (points P1s) ToAffine(optional ...P1Affines) P1Affines { + npoints := len(points) + var ret P1Affines + + if len(optional) > 0 { // used in benchmark + ret = optional[0] + if len(ret) < npoints { + panic("npoints 
mismatch") + } + } else { + ret = make([]P1Affine, npoints) + } + + if maxProcs < 2 || npoints < 768 { + C.go_p1slice_to_affine(&ret[0], &points[0], C.size_t(npoints)) + return ret + } + + nslices := (npoints + 511) / 512 + if nslices > maxProcs { + nslices = maxProcs + } + delta, rem := npoints/nslices+1, npoints%nslices + + var wg sync.WaitGroup + wg.Add(nslices) + for x := 0; x < npoints; x += delta { + if rem == 0 { + delta -= 1 + } + rem -= 1 + go func(out *P1Affine, inp *P1, delta int) { + C.go_p1slice_to_affine(out, inp, C.size_t(delta)) + wg.Done() + }(&ret[x], &points[x], delta) + } + wg.Wait() + + return ret +} + +// +// Batch addition +// + +func P1AffinesAdd(points []*P1Affine, optional ...int) *P1 { + var npoints int + if len(optional) > 0 { + npoints = optional[0] + } else { + npoints = len(points) + } + var ret P1 + _cgoCheckPointer := func(...interface{}) {} + C.blst_p1s_add(&ret, &points[0], C.size_t(npoints)) + return &ret +} + +func (points P1Affines) Add() *P1 { + npoints := len(points) + if maxProcs < 2 || npoints < 768 { + var ret P1 + C.go_p1slice_add(&ret, &points[0], C.size_t(npoints)) + return &ret + } + + nslices := (npoints + 511) / 512 + if nslices > maxProcs { + nslices = maxProcs + } + delta, rem := npoints/nslices+1, npoints%nslices + + msgs := make(chan P1, nslices) + for x := 0; x < npoints; x += delta { + if rem == 0 { + delta -= 1 + } + rem -= 1 + go func(points *P1Affine, delta int) { + var ret P1 + C.go_p1slice_add(&ret, points, C.size_t(delta)) + msgs <- ret + }(&points[x], delta) + } + + ret := <-msgs + for i := 1; i < nslices; i++ { + msg := <-msgs + C.blst_p1_add_or_double(&ret, &ret, &msg) + } + return &ret +} + +func (points P1s) Add() *P1 { + return points.ToAffine().Add() +} + +// +// Multi-scalar multiplication +// + +func P1AffinesMult(pointsIf interface{}, scalarsIf interface{}, nbits int) *P1 { + var npoints int + switch val := pointsIf.(type) { + case []*P1Affine: + npoints = len(val) + case []P1Affine: + npoints = len(val) + case P1Affines: + npoints = len(val) + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + nbytes := (nbits + 7) / 8 + var scalars []*C.byte + switch val := scalarsIf.(type) { + case []byte: + if len(val) < npoints*nbytes { + return nil + } + case [][]byte: + if len(val) < npoints { + return nil + } + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = (*C.byte)(&val[i][0]) + } + case []Scalar: + if len(val) < npoints { + return nil + } + if nbits <= 248 { + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = &val[i].b[0] + } + } + case []*Scalar: + if len(val) < npoints { + return nil + } + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = &val[i].b[0] + } + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + numThreads := maxProcs + numCores := runtime.GOMAXPROCS(0) + if numCores < maxProcs { + numThreads = numCores + } + + if numThreads < 2 || npoints < 32 { + sz := int(C.blst_p1s_mult_pippenger_scratch_sizeof(C.size_t(npoints))) / 8 + scratch := make([]uint64, sz) + + pointsBySlice := [2]*P1Affine{nil, nil} + var p_points **P1Affine + switch val := pointsIf.(type) { + case []*P1Affine: + p_points = &val[0] + case []P1Affine: + pointsBySlice[0] = &val[0] + p_points = &pointsBySlice[0] + case P1Affines: + pointsBySlice[0] = &val[0] + p_points = &pointsBySlice[0] + } + + scalarsBySlice := [2]*C.byte{nil, nil} + var p_scalars **C.byte + switch val := scalarsIf.(type) { + case []byte: + scalarsBySlice[0] = 
(*C.byte)(&val[0]) + p_scalars = &scalarsBySlice[0] + case [][]byte: + p_scalars = &scalars[0] + case []Scalar: + if nbits > 248 { + scalarsBySlice[0] = (*C.byte)(&val[0].b[0]) + p_scalars = &scalarsBySlice[0] + } else { + p_scalars = &scalars[0] + } + case []*Scalar: + p_scalars = &scalars[0] + } + + var ret P1 + _cgoCheckPointer := func(...interface{}) {} + C.blst_p1s_mult_pippenger(&ret, p_points, C.size_t(npoints), + p_scalars, C.size_t(nbits), + (*C.limb_t)(&scratch[0])) + + for i := range scalars { + scalars[i] = nil + } + + return &ret + } + + // this is sizeof(scratch[0]) + sz := int(C.blst_p1s_mult_pippenger_scratch_sizeof(0)) / 8 + + nx, ny, window := breakdown(nbits, pippenger_window_size(npoints), + numThreads) + + // |grid[]| holds "coordinates" and place for result + grid := make([]struct { + x, dx, y, dy int + point P1 + }, nx*ny) + + dx := npoints / nx + y := window * (ny - 1) + total := 0 + for ; total < nx; total++ { + grid[total].x = total * dx + grid[total].dx = dx + grid[total].y = y + grid[total].dy = nbits - y + } + grid[total-1].dx = npoints - grid[total-1].x + + for y > 0 { + y -= window + for i := 0; i < nx; i++ { + grid[total].x = grid[i].x + grid[total].dx = grid[i].dx + grid[total].y = y + grid[total].dy = window + total++ + } + } + + if numThreads > total { + numThreads = total + } + + msgsCh := make(chan int, ny) + rowSync := make([]int32, ny) // count up to |nx| + curItem := int32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + scratch := make([]uint64, sz<<uint(window-1)) + pointsBySlice := [2]*P1Affine{nil, nil} + scalarsBySlice := [2]*C.byte{nil, nil} + _cgoCheckPointer := func(...interface{}) {} + + for { + workItem := atomic.AddInt32(&curItem, 1) - 1 + if int(workItem) >= total { + break + } + + x := grid[workItem].x + y := grid[workItem].y + + var p_points **P1Affine + switch val := pointsIf.(type) { + case []*P1Affine: + p_points = &val[x] + case []P1Affine: + pointsBySlice[0] = &val[x] + p_points = &pointsBySlice[0] + case P1Affines: + pointsBySlice[0] = &val[x] + p_points = &pointsBySlice[0] + } + + var p_scalars **C.byte + switch val := scalarsIf.(type) { + case []byte: + scalarsBySlice[0] = (*C.byte)(&val[x*nbytes]) + p_scalars = &scalarsBySlice[0] + case [][]byte: + p_scalars = &scalars[x] + case []Scalar: + if nbits > 248 { + scalarsBySlice[0] = (*C.byte)(&val[x].b[0]) + p_scalars = &scalarsBySlice[0] + } else { + p_scalars = &scalars[x] + } + case []*Scalar: + p_scalars = &scalars[x] + } + + C.blst_p1s_tile_pippenger(&grid[workItem].point, + p_points, C.size_t(grid[workItem].dx), + p_scalars, C.size_t(nbits), + (*C.limb_t)(&scratch[0]), + C.size_t(y), C.size_t(window)) + + if atomic.AddInt32(&rowSync[y/window], 1) == int32(nx) { + msgsCh <- y // "row" is done + } else { + runtime.Gosched() // be nice to the application + } + } + + pointsBySlice[0] = nil + scalarsBySlice[0] = nil + }() + } + + var ret P1 + rows := make([]bool, ny) + row := 0 // actually index in |grid[]| + for i := 0; i < ny; i++ { // we expect |ny| messages, one per "row" + y := <-msgsCh + rows[y/window] = true // mark the "row" + for grid[row].y == y { // if it's current "row", process it + for row < total && grid[row].y == y { + C.blst_p1_add_or_double(&ret, &ret, &grid[row].point) + row++ + } + if y == 0 { + break // one can as well 'return &ret' here + } + for j := 0; j < window; j++ { + C.blst_p1_double(&ret, &ret) + } + y -= window + if !rows[y/window] { // see if next "row" was marked already + break + } + } + } + + for i := range scalars { + scalars[i] = nil + } + + return &ret +} + +func (points P1Affines) Mult(scalarsIf interface{}, nbits int) *P1 { + return P1AffinesMult(points, scalarsIf, nbits) +} + +func (points P1s) Mult(scalarsIf interface{}, nbits int) *P1 { + return 
points.ToAffine().Mult(scalarsIf, nbits) +} +func PairingAggregatePkInG2(ctx Pairing, PK *P2Affine, pkValidate bool, + sig *P1Affine, sigGroupcheck bool, msg []byte, + optional ...[]byte) int { // aug + var aug []byte + var uaug *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + uaug = (*C.byte)(&aug[0]) + } + } + var umsg *C.byte + if len(msg) > 0 { + umsg = (*C.byte)(&msg[0]) + } + + r := C.blst_pairing_chk_n_aggr_pk_in_g2(&ctx[0], + PK, C.bool(pkValidate), + sig, C.bool(sigGroupcheck), + umsg, C.size_t(len(msg)), + uaug, C.size_t(len(aug))) + + return int(r) +} + +func PairingMulNAggregatePkInG2(ctx Pairing, PK *P2Affine, pkValidate bool, + sig *P1Affine, sigGroupcheck bool, + rand *Scalar, randBits int, msg []byte, + optional ...[]byte) int { // aug + var aug []byte + var uaug *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + uaug = (*C.byte)(&aug[0]) + } + } + var umsg *C.byte + if len(msg) > 0 { + umsg = (*C.byte)(&msg[0]) + } + + r := C.blst_pairing_chk_n_mul_n_aggr_pk_in_g2(&ctx[0], + PK, C.bool(pkValidate), + sig, C.bool(sigGroupcheck), + &rand.b[0], C.size_t(randBits), + umsg, C.size_t(len(msg)), + uaug, C.size_t(len(aug))) + + return int(r) +} + +// +// Serialization/Deserialization. +// + +// P2 Serdes +func (p2 *P2Affine) Serialize() []byte { + var out [BLST_P2_SERIALIZE_BYTES]byte + C.blst_p2_affine_serialize((*C.byte)(&out[0]), p2) + return out[:] +} + +func (p2 *P2Affine) Deserialize(in []byte) *P2Affine { + if len(in) != BLST_P2_SERIALIZE_BYTES { + return nil + } + if C.blst_p2_deserialize(p2, (*C.byte)(&in[0])) != C.BLST_SUCCESS { + return nil + } + return p2 +} +func (p2 *P2Affine) Compress() []byte { + var out [BLST_P2_COMPRESS_BYTES]byte + C.blst_p2_affine_compress((*C.byte)(&out[0]), p2) + return out[:] +} + +func (p2 *P2Affine) Uncompress(in []byte) *P2Affine { + if len(in) != BLST_P2_COMPRESS_BYTES { + return nil + } + if C.blst_p2_uncompress(p2, (*C.byte)(&in[0])) != C.BLST_SUCCESS { + return nil + } + return p2 +} + +func (p2 *P2Affine) InG2() bool { + return bool(C.blst_p2_affine_in_g2(p2)) +} + +func (dummy *P2Affine) BatchUncompress(in [][]byte) []*P2Affine { + // Allocate space for all of the resulting points. Later we'll save pointers + // and return those so that the result could be used in other functions, + // such as MultipleAggregateVerify. + n := len(in) + points := make([]P2Affine, n) + pointsPtrs := make([]*P2Affine, n) + + numCores := runtime.GOMAXPROCS(0) + numThreads := maxProcs + if numThreads > numCores { + numThreads = numCores + } + if numThreads > n { + numThreads = n + } + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding point, and + // repeat until n is exceeded. Each thread will send a result (true for + // success, false for failure) into the channel when complete. 
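+ // As with the P1Affine variant above, a single failed Uncompress marks the
+ // whole batch invalid and nil is returned to the caller.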
+ resCh := make(chan bool, numThreads) + valid := int32(1) + curItem := uint32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + if points[work].Uncompress(in[work]) == nil { + atomic.StoreInt32(&valid, 0) + break + } + pointsPtrs[work] = &points[work] + } + if atomic.LoadInt32(&valid) > 0 { + resCh <- true + } else { + resCh <- false + } + }() + } + + // Collect the threads + result := true + for i := 0; i < numThreads; i++ { + if !<-resCh { + result = false + } + } + if atomic.LoadInt32(&valid) == 0 || !result { + return nil + } + return pointsPtrs +} + +func (p2 *P2) Serialize() []byte { + var out [BLST_P2_SERIALIZE_BYTES]byte + C.blst_p2_serialize((*C.byte)(&out[0]), p2) + return out[:] +} +func (p2 *P2) Compress() []byte { + var out [BLST_P2_COMPRESS_BYTES]byte + C.blst_p2_compress((*C.byte)(&out[0]), p2) + return out[:] +} + +func (p2 *P2) MultAssign(scalarIf interface{}, optional ...int) *P2 { + var nbits int + var scalar *C.byte + switch val := scalarIf.(type) { + case []byte: + scalar = (*C.byte)(&val[0]) + nbits = len(val) * 8 + case *Scalar: + scalar = &val.b[0] + nbits = 255 + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + if len(optional) > 0 { + nbits = optional[0] + } + C.blst_p2_mult(p2, p2, scalar, C.size_t(nbits)) + return p2 +} + +func (p2 *P2) Mult(scalarIf interface{}, optional ...int) *P2 { + ret := *p2 + return ret.MultAssign(scalarIf, optional...) +} + +func (p2 *P2) AddAssign(pointIf interface{}) *P2 { + switch val := pointIf.(type) { + case *P2: + C.blst_p2_add_or_double(p2, p2, val) + case *P2Affine: + C.blst_p2_add_or_double_affine(p2, p2, val) + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + return p2 +} + +func (p2 *P2) Add(pointIf interface{}) *P2 { + ret := *p2 + return ret.AddAssign(pointIf) +} + +func (p2 *P2) SubAssign(pointIf interface{}) *P2 { + var x *Fp2 + var affine C.bool + switch val := pointIf.(type) { + case *P2: + x = &val.x + affine = false + case *P2Affine: + x = &val.x + affine = true + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + C.go_p2_sub_assign(p2, x, affine) + return p2 +} + +func (p2 *P2) Sub(pointIf interface{}) *P2 { + ret := *p2 + return ret.SubAssign(pointIf) +} + +func P2Generator() *P2 { + return C.blst_p2_generator() +} + +// 'acc += point * scalar', passing 'nil' for 'point' means "use the +// +// group generator point" +func (acc *P2) MultNAccumulate(pointIf interface{}, scalarIf interface{}, + optional ...int) *P2 { + var x *Fp2 + var affine C.bool + if pointIf != nil { + switch val := pointIf.(type) { + case *P2: + x = &val.x + affine = false + case *P2Affine: + x = &val.x + affine = true + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + } + var nbits int + var scalar *C.byte + switch val := scalarIf.(type) { + case []byte: + scalar = (*C.byte)(&val[0]) + nbits = len(val) * 8 + case *Scalar: + scalar = &val.b[0] + nbits = 255 + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + if len(optional) > 0 { + nbits = optional[0] + } + C.go_p2_mult_n_acc(acc, x, affine, scalar, C.size_t(nbits)) + return acc +} + +// +// Affine +// + +func (p *P2) ToAffine() *P2Affine { + var pa P2Affine + C.blst_p2_to_affine(&pa, p) + return &pa +} + +func (p *P2) FromAffine(pa *P2Affine) { + C.blst_p2_from_affine(p, pa) +} + +// Hash +func HashToG2(msg []byte, dst []byte, + optional ...[]byte) *P2 { // aug + var q P2 + + 
// Handle zero length message + var msgC *C.byte + if len(msg) > 0 { + msgC = (*C.byte)(&msg[0]) + } + + var dstC *C.byte + if len(dst) > 0 { + dstC = (*C.byte)(&dst[0]) + } + + var aug []byte + var augC *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + augC = (*C.byte)(&aug[0]) + } + } + + C.blst_hash_to_g2(&q, msgC, C.size_t(len(msg)), + dstC, C.size_t(len(dst)), + augC, C.size_t(len(aug))) + return &q +} + +func EncodeToG2(msg []byte, dst []byte, + optional ...[]byte) *P2 { // aug + var q P2 + + // Handle zero length message + var msgC *C.byte + if len(msg) > 0 { + msgC = (*C.byte)(&msg[0]) + } + + var dstC *C.byte + if len(dst) > 0 { + dstC = (*C.byte)(&dst[0]) + } + + var aug []byte + var augC *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + augC = (*C.byte)(&aug[0]) + } + } + + C.blst_encode_to_g2(&q, msgC, C.size_t(len(msg)), + dstC, C.size_t(len(dst)), + augC, C.size_t(len(aug))) + return &q +} + +// +// Multi-point/scalar operations +// + +func P2sToAffine(points []*P2, optional ...int) P2Affines { + var npoints int + if len(optional) > 0 { + npoints = optional[0] + } else { + npoints = len(points) + } + ret := make([]P2Affine, npoints) + _cgoCheckPointer := func(...interface{}) {} + C.blst_p2s_to_affine(&ret[0], &points[0], C.size_t(npoints)) + return ret +} + +func (points P2s) ToAffine(optional ...P2Affines) P2Affines { + npoints := len(points) + var ret P2Affines + + if len(optional) > 0 { // used in benchmark + ret = optional[0] + if len(ret) < npoints { + panic("npoints mismatch") + } + } else { + ret = make([]P2Affine, npoints) + } + + if maxProcs < 2 || npoints < 768 { + C.go_p2slice_to_affine(&ret[0], &points[0], C.size_t(npoints)) + return ret + } + + nslices := (npoints + 511) / 512 + if nslices > maxProcs { + nslices = maxProcs + } + delta, rem := npoints/nslices+1, npoints%nslices + + var wg sync.WaitGroup + wg.Add(nslices) + for x := 0; x < npoints; x += delta { + if rem == 0 { + delta -= 1 + } + rem -= 1 + go func(out *P2Affine, inp *P2, delta int) { + C.go_p2slice_to_affine(out, inp, C.size_t(delta)) + wg.Done() + }(&ret[x], &points[x], delta) + } + wg.Wait() + + return ret +} + +// +// Batch addition +// + +func P2AffinesAdd(points []*P2Affine, optional ...int) *P2 { + var npoints int + if len(optional) > 0 { + npoints = optional[0] + } else { + npoints = len(points) + } + var ret P2 + _cgoCheckPointer := func(...interface{}) {} + C.blst_p2s_add(&ret, &points[0], C.size_t(npoints)) + return &ret +} + +func (points P2Affines) Add() *P2 { + npoints := len(points) + if maxProcs < 2 || npoints < 768 { + var ret P2 + C.go_p2slice_add(&ret, &points[0], C.size_t(npoints)) + return &ret + } + + nslices := (npoints + 511) / 512 + if nslices > maxProcs { + nslices = maxProcs + } + delta, rem := npoints/nslices+1, npoints%nslices + + msgs := make(chan P2, nslices) + for x := 0; x < npoints; x += delta { + if rem == 0 { + delta -= 1 + } + rem -= 1 + go func(points *P2Affine, delta int) { + var ret P2 + C.go_p2slice_add(&ret, points, C.size_t(delta)) + msgs <- ret + }(&points[x], delta) + } + + ret := <-msgs + for i := 1; i < nslices; i++ { + msg := <-msgs + C.blst_p2_add_or_double(&ret, &ret, &msg) + } + return &ret +} + +func (points P2s) Add() *P2 { + return points.ToAffine().Add() +} + +// +// Multi-scalar multiplication +// + +func P2AffinesMult(pointsIf interface{}, scalarsIf interface{}, nbits int) *P2 { + var npoints int + switch val := pointsIf.(type) { + case []*P2Affine: + npoints = len(val) + case []P2Affine: + 
npoints = len(val) + case P2Affines: + npoints = len(val) + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + nbytes := (nbits + 7) / 8 + var scalars []*C.byte + switch val := scalarsIf.(type) { + case []byte: + if len(val) < npoints*nbytes { + return nil + } + case [][]byte: + if len(val) < npoints { + return nil + } + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = (*C.byte)(&val[i][0]) + } + case []Scalar: + if len(val) < npoints { + return nil + } + if nbits <= 248 { + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = &val[i].b[0] + } + } + case []*Scalar: + if len(val) < npoints { + return nil + } + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = &val[i].b[0] + } + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + numThreads := maxProcs + numCores := runtime.GOMAXPROCS(0) + if numCores < maxProcs { + numThreads = numCores + } + + if numThreads < 2 || npoints < 32 { + sz := int(C.blst_p2s_mult_pippenger_scratch_sizeof(C.size_t(npoints))) / 8 + scratch := make([]uint64, sz) + + pointsBySlice := [2]*P2Affine{nil, nil} + var p_points **P2Affine + switch val := pointsIf.(type) { + case []*P2Affine: + p_points = &val[0] + case []P2Affine: + pointsBySlice[0] = &val[0] + p_points = &pointsBySlice[0] + case P2Affines: + pointsBySlice[0] = &val[0] + p_points = &pointsBySlice[0] + } + + scalarsBySlice := [2]*C.byte{nil, nil} + var p_scalars **C.byte + switch val := scalarsIf.(type) { + case []byte: + scalarsBySlice[0] = (*C.byte)(&val[0]) + p_scalars = &scalarsBySlice[0] + case [][]byte: + p_scalars = &scalars[0] + case []Scalar: + if nbits > 248 { + scalarsBySlice[0] = (*C.byte)(&val[0].b[0]) + p_scalars = &scalarsBySlice[0] + } else { + p_scalars = &scalars[0] + } + case []*Scalar: + p_scalars = &scalars[0] + } + + var ret P2 + _cgoCheckPointer := func(...interface{}) {} + C.blst_p2s_mult_pippenger(&ret, p_points, C.size_t(npoints), + p_scalars, C.size_t(nbits), + (*C.limb_t)(&scratch[0])) + + for i := range scalars { + scalars[i] = nil + } + + return &ret + } + + // this is sizeof(scratch[0]) + sz := int(C.blst_p2s_mult_pippenger_scratch_sizeof(0)) / 8 + + nx, ny, window := breakdown(nbits, pippenger_window_size(npoints), + numThreads) + + // |grid[]| holds "coordinates" and place for result + grid := make([]struct { + x, dx, y, dy int + point P2 + }, nx*ny) + + dx := npoints / nx + y := window * (ny - 1) + total := 0 + for ; total < nx; total++ { + grid[total].x = total * dx + grid[total].dx = dx + grid[total].y = y + grid[total].dy = nbits - y + } + grid[total-1].dx = npoints - grid[total-1].x + + for y > 0 { + y -= window + for i := 0; i < nx; i++ { + grid[total].x = grid[i].x + grid[total].dx = grid[i].dx + grid[total].y = y + grid[total].dy = window + total++ + } + } + + if numThreads > total { + numThreads = total + } + + msgsCh := make(chan int, ny) + rowSync := make([]int32, ny) // count up to |nx| + curItem := int32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + scratch := make([]uint64, sz<<uint(window-1)) + pointsBySlice := [2]*P2Affine{nil, nil} + scalarsBySlice := [2]*C.byte{nil, nil} + _cgoCheckPointer := func(...interface{}) {} + + for { + workItem := atomic.AddInt32(&curItem, 1) - 1 + if int(workItem) >= total { + break + } + + x := grid[workItem].x + y := grid[workItem].y + + var p_points **P2Affine + switch val := pointsIf.(type) { + case []*P2Affine: + p_points = &val[x] + case []P2Affine: + pointsBySlice[0] = &val[x] + p_points = &pointsBySlice[0] + case P2Affines: + pointsBySlice[0] = &val[x] + p_points = &pointsBySlice[0] + } + + var p_scalars **C.byte + switch val := scalarsIf.(type) { + case []byte: + scalarsBySlice[0] = (*C.byte)(&val[x*nbytes]) + p_scalars 
= &scalarsBySlice[0] + case [][]byte: + p_scalars = &scalars[x] + case []Scalar: + if nbits > 248 { + scalarsBySlice[0] = (*C.byte)(&val[x].b[0]) + p_scalars = &scalarsBySlice[0] + } else { + p_scalars = &scalars[x] + } + case []*Scalar: + p_scalars = &scalars[x] + } + + C.blst_p2s_tile_pippenger(&grid[workItem].point, + p_points, C.size_t(grid[workItem].dx), + p_scalars, C.size_t(nbits), + (*C.limb_t)(&scratch[0]), + C.size_t(y), C.size_t(window)) + + if atomic.AddInt32(&rowSync[y/window], 1) == int32(nx) { + msgsCh <- y // "row" is done + } else { + runtime.Gosched() // be nice to the application + } + } + + pointsBySlice[0] = nil + scalarsBySlice[0] = nil + }() + } + + var ret P2 + rows := make([]bool, ny) + row := 0 // actually index in |grid[]| + for i := 0; i < ny; i++ { // we expect |ny| messages, one per "row" + y := <-msgsCh + rows[y/window] = true // mark the "row" + for grid[row].y == y { // if it's current "row", process it + for row < total && grid[row].y == y { + C.blst_p2_add_or_double(&ret, &ret, &grid[row].point) + row++ + } + if y == 0 { + break // one can as well 'return &ret' here + } + for j := 0; j < window; j++ { + C.blst_p2_double(&ret, &ret) + } + y -= window + if !rows[y/window] { // see if next "row" was marked already + break + } + } + } + + for i := range scalars { + scalars[i] = nil + } + + return &ret +} + +func (points P2Affines) Mult(scalarsIf interface{}, nbits int) *P2 { + return P2AffinesMult(points, scalarsIf, nbits) +} + +func (points P2s) Mult(scalarsIf interface{}, nbits int) *P2 { + return points.ToAffine().Mult(scalarsIf, nbits) +} + +func parseOpts(optional ...interface{}) ([]byte, [][]byte, bool, bool) { + var aug [][]byte // For aggregate verify + var augSingle []byte // For signing + useHash := true // hash (true), encode (false) + + for _, arg := range optional { + switch v := arg.(type) { + case []byte: + augSingle = v + case [][]byte: + aug = v + case bool: + useHash = v + default: + return nil, nil, useHash, false + } + } + return augSingle, aug, useHash, true +} + +func bytesAllZero(s []byte) bool { + for _, v := range s { + if v != 0 { + return false + } + } + return true +} + +// These methods are inefficient because of cgo call overhead. For this +// reason they should be used primarily for prototyping with a goal to +// formulate interfaces that would process multiple scalars per cgo call. +func (a *Scalar) MulAssign(b *Scalar) (*Scalar, bool) { + return a, bool(C.blst_sk_mul_n_check(a, a, b)) +} + +func (a *Scalar) Mul(b *Scalar) (*Scalar, bool) { + var ret Scalar + return &ret, bool(C.blst_sk_mul_n_check(&ret, a, b)) +} + +func (a *Scalar) AddAssign(b *Scalar) (*Scalar, bool) { + return a, bool(C.blst_sk_add_n_check(a, a, b)) +} + +func (a *Scalar) Add(b *Scalar) (*Scalar, bool) { + var ret Scalar + return &ret, bool(C.blst_sk_add_n_check(&ret, a, b)) +} + +func (a *Scalar) SubAssign(b *Scalar) (*Scalar, bool) { + return a, bool(C.blst_sk_sub_n_check(a, a, b)) +} + +func (a *Scalar) Sub(b *Scalar) (*Scalar, bool) { + var ret Scalar + return &ret, bool(C.blst_sk_sub_n_check(&ret, a, b)) +} + +func (a *Scalar) Inverse() *Scalar { + var ret Scalar + C.blst_sk_inverse(&ret, a) + return &ret +} + +// +// Serialization/Deserialization. 
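+// Scalars serialize as 32 big-endian bytes (BLST_SCALAR_BYTES); Fp elements
+// as 48 bytes (BLST_FP_BYTES).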
+// + +// Scalar serdes +func (s *Scalar) Serialize() []byte { + var out [BLST_SCALAR_BYTES]byte + C.blst_bendian_from_scalar((*C.byte)(&out[0]), s) + return out[:] +} + +func (s *Scalar) Deserialize(in []byte) *Scalar { + if len(in) != BLST_SCALAR_BYTES || + !C.go_scalar_from_bendian(s, (*C.byte)(&in[0])) { + return nil + } + return s +} + +func (s *Scalar) Valid() bool { + return bool(C.blst_sk_check(s)) +} + +func (s *Scalar) HashTo(msg []byte, dst []byte) bool { + ret := HashToScalar(msg, dst) + if ret != nil { + *s = *ret + return true + } + return false +} + +func HashToScalar(msg []byte, dst []byte) *Scalar { + var ret Scalar + + var msgC *C.byte + if len(msg) > 0 { + msgC = (*C.byte)(&msg[0]) + } + + var dstC *C.byte + if len(dst) > 0 { + dstC = (*C.byte)(&dst[0]) + } + + if C.go_hash_to_scalar(&ret, msgC, C.size_t(len(msg)), + dstC, C.size_t(len(dst))) { + return &ret + } + + return nil +} + +// +// LEndian +// + +func (fr *Scalar) ToLEndian() []byte { + var arr [BLST_SCALAR_BYTES]byte + C.blst_lendian_from_scalar((*C.byte)(&arr[0]), fr) + return arr[:] +} + +func (fp *Fp) ToLEndian() []byte { + var arr [BLST_FP_BYTES]byte + C.blst_lendian_from_fp((*C.byte)(&arr[0]), fp) + return arr[:] +} + +func (fr *Scalar) FromLEndian(arr []byte) *Scalar { + nbytes := len(arr) + if nbytes < BLST_SCALAR_BYTES || + !C.blst_scalar_from_le_bytes(fr, (*C.byte)(&arr[0]), C.size_t(nbytes)) { + return nil + } + return fr +} + +func (fp *Fp) FromLEndian(arr []byte) *Fp { + if len(arr) != BLST_FP_BYTES { + return nil + } + C.blst_fp_from_lendian(fp, (*C.byte)(&arr[0])) + return fp +} + +// +// BEndian +// + +func (fr *Scalar) ToBEndian() []byte { + var arr [BLST_SCALAR_BYTES]byte + C.blst_bendian_from_scalar((*C.byte)(&arr[0]), fr) + return arr[:] +} + +func (fp *Fp) ToBEndian() []byte { + var arr [BLST_FP_BYTES]byte + C.blst_bendian_from_fp((*C.byte)(&arr[0]), fp) + return arr[:] +} + +func (fr *Scalar) FromBEndian(arr []byte) *Scalar { + nbytes := len(arr) + if nbytes < BLST_SCALAR_BYTES || + !C.blst_scalar_from_be_bytes(fr, (*C.byte)(&arr[0]), C.size_t(nbytes)) { + return nil + } + return fr +} + +func (fp *Fp) FromBEndian(arr []byte) *Fp { + if len(arr) != BLST_FP_BYTES { + return nil + } + C.blst_fp_from_bendian(fp, (*C.byte)(&arr[0])) + return fp +} + +// +// Printing +// + +func PrintBytes(val []byte, name string) { + fmt.Printf("%s = %02x\n", name, val) +} + +func (s *Scalar) Print(name string) { + arr := s.ToBEndian() + PrintBytes(arr[:], name) +} + +func (p *P1Affine) Print(name string) { + fmt.Printf("%s:\n", name) + arr := p.x.ToBEndian() + PrintBytes(arr, " x") + arr = p.y.ToBEndian() + PrintBytes(arr, " y") +} + +func (p *P1) Print(name string) { + fmt.Printf("%s:\n", name) + aff := p.ToAffine() + aff.Print(name) +} + +func (f *Fp2) Print(name string) { + fmt.Printf("%s:\n", name) + arr := f.fp[0].ToBEndian() + PrintBytes(arr, " 0") + arr = f.fp[1].ToBEndian() + PrintBytes(arr, " 1") +} + +func (p *P2Affine) Print(name string) { + fmt.Printf("%s:\n", name) + p.x.Print(" x") + p.y.Print(" y") +} + +func (p *P2) Print(name string) { + fmt.Printf("%s:\n", name) + aff := p.ToAffine() + aff.Print(name) +} + +// +// Equality +// + +func (s1 *Scalar) Equals(s2 *Scalar) bool { + return *s1 == *s2 +} + +func (e1 *Fp) Equals(e2 *Fp) bool { + return *e1 == *e2 +} + +func (e1 *Fp2) Equals(e2 *Fp2) bool { + return *e1 == *e2 +} + +func (e1 *P1Affine) Equals(e2 *P1Affine) bool { + return bool(C.blst_p1_affine_is_equal(e1, e2)) +} + +func (e1 *P1) Equals(e2 *P1) bool { + return 
bool(C.blst_p1_is_equal(e1, e2)) +} + +func (e1 *P2Affine) Equals(e2 *P2Affine) bool { + return bool(C.blst_p2_affine_is_equal(e1, e2)) +} + +func (e1 *P2) Equals(e2 *P2) bool { + return bool(C.blst_p2_is_equal(e1, e2)) +} + +// private thunk for testing + +func expandMessageXmd(msg []byte, dst []byte, len_in_bytes int) []byte { + ret := make([]byte, len_in_bytes) + + var msgC *C.byte + if len(msg) > 0 { + msgC = (*C.byte)(&msg[0]) + } + + var dstC *C.byte + if len(dst) > 0 { + dstC = (*C.byte)(&dst[0]) + } + + C.blst_expand_message_xmd((*C.byte)(&ret[0]), C.size_t(len(ret)), + msgC, C.size_t(len(msg)), + dstC, C.size_t(len(dst))) + return ret +} + +func breakdown(nbits, window, ncpus int) (int, int, int) { + var nx, ny, wnd int + + if nbits > window*ncpus { + nx = 1 + wnd = bits.Len(uint(ncpus) / 4) + if (window + wnd) > 18 { + wnd = window - wnd + } else { + wnd = (nbits/window + ncpus - 1) / ncpus + if (nbits/(window+1)+ncpus-1)/ncpus < wnd { + wnd = window + 1 + } else { + wnd = window + } + } + } else { + nx = 2 + wnd = window - 2 + for (nbits/wnd+1)*nx < ncpus { + nx += 1 + wnd = window - bits.Len(3*uint(nx)/2) + } + nx -= 1 + wnd = window - bits.Len(3*uint(nx)/2) + } + ny = nbits/wnd + 1 + wnd = nbits/ny + 1 + + return nx, ny, wnd +} + +func pippenger_window_size(npoints int) int { + wbits := bits.Len(uint(npoints)) + + if wbits > 13 { + return wbits - 4 + } + if wbits > 5 { + return wbits - 3 + } + return 2 +} diff --git a/crypto/internal/blst/blst.h b/crypto/internal/blst/blst.h new file mode 100644 index 00000000000..2e314b3a32e --- /dev/null +++ b/crypto/internal/blst/blst.h @@ -0,0 +1,483 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLST_H__ +#define __BLST_H__ + +#ifdef __SIZE_TYPE__ +typedef __SIZE_TYPE__ size_t; +#else +#include <stddef.h> +#endif + +#if defined(__UINT8_TYPE__) && defined(__UINT32_TYPE__) \ + && defined(__UINT64_TYPE__) +typedef __UINT8_TYPE__ uint8_t; +typedef __UINT32_TYPE__ uint32_t; +typedef __UINT64_TYPE__ uint64_t; +#else +#include <stdint.h> +#endif + +#ifdef __cplusplus +extern "C" { +#elif defined(__BLST_CGO__) +typedef _Bool bool; /* it's assumed that cgo calls modern enough compiler */ +#elif defined(__STDC_VERSION__) && __STDC_VERSION__>=199901 +# define bool _Bool +#else +# define bool int +#endif + +#ifdef SWIG +# define DEFNULL =NULL +#elif defined __cplusplus +# define DEFNULL =0 +#else +# define DEFNULL +#endif + +typedef enum { + BLST_SUCCESS = 0, + BLST_BAD_ENCODING, + BLST_POINT_NOT_ON_CURVE, + BLST_POINT_NOT_IN_GROUP, + BLST_AGGR_TYPE_MISMATCH, + BLST_VERIFY_FAIL, + BLST_PK_IS_INFINITY, + BLST_BAD_SCALAR, +} BLST_ERROR; + +typedef uint8_t byte; +typedef uint64_t limb_t; + +typedef struct { byte b[256/8]; } blst_scalar; +typedef struct { limb_t l[256/8/sizeof(limb_t)]; } blst_fr; +typedef struct { limb_t l[384/8/sizeof(limb_t)]; } blst_fp; +/* 0 is "real" part, 1 is "imaginary" */ +typedef struct { blst_fp fp[2]; } blst_fp2; +typedef struct { blst_fp2 fp2[3]; } blst_fp6; +typedef struct { blst_fp6 fp6[2]; } blst_fp12; + +void blst_scalar_from_uint32(blst_scalar *out, const uint32_t a[8]); +void blst_uint32_from_scalar(uint32_t out[8], const blst_scalar *a); +void blst_scalar_from_uint64(blst_scalar *out, const uint64_t a[4]); +void blst_uint64_from_scalar(uint64_t out[4], const blst_scalar *a); +void blst_scalar_from_bendian(blst_scalar *out, const byte a[32]); +void blst_bendian_from_scalar(byte out[32], const blst_scalar *a); +void 
blst_scalar_from_lendian(blst_scalar *out, const byte a[32]); +void blst_lendian_from_scalar(byte out[32], const blst_scalar *a); +bool blst_scalar_fr_check(const blst_scalar *a); +bool blst_sk_check(const blst_scalar *a); +bool blst_sk_add_n_check(blst_scalar *out, const blst_scalar *a, + const blst_scalar *b); +bool blst_sk_sub_n_check(blst_scalar *out, const blst_scalar *a, + const blst_scalar *b); +bool blst_sk_mul_n_check(blst_scalar *out, const blst_scalar *a, + const blst_scalar *b); +void blst_sk_inverse(blst_scalar *out, const blst_scalar *a); +bool blst_scalar_from_le_bytes(blst_scalar *out, const byte *in, size_t len); +bool blst_scalar_from_be_bytes(blst_scalar *out, const byte *in, size_t len); + +#ifndef SWIG +/* + * BLS12-381-specific Fr operations. + */ +void blst_fr_add(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_sub(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_mul_by_3(blst_fr *ret, const blst_fr *a); +void blst_fr_lshift(blst_fr *ret, const blst_fr *a, size_t count); +void blst_fr_rshift(blst_fr *ret, const blst_fr *a, size_t count); +void blst_fr_mul(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_sqr(blst_fr *ret, const blst_fr *a); +void blst_fr_cneg(blst_fr *ret, const blst_fr *a, bool flag); +void blst_fr_eucl_inverse(blst_fr *ret, const blst_fr *a); +void blst_fr_inverse(blst_fr *ret, const blst_fr *a); +#ifdef BLST_FR_PENTAROOT +void blst_fr_pentaroot(blst_fr *ret, const blst_fr *a); +void blst_fr_pentapow(blst_fr *ret, const blst_fr *a); +#endif + +void blst_fr_from_uint64(blst_fr *ret, const uint64_t a[4]); +void blst_uint64_from_fr(uint64_t ret[4], const blst_fr *a); +void blst_fr_from_scalar(blst_fr *ret, const blst_scalar *a); +void blst_scalar_from_fr(blst_scalar *ret, const blst_fr *a); + +/* + * BLS12-381-specific Fp operations. + */ +void blst_fp_add(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_sub(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_mul_by_3(blst_fp *ret, const blst_fp *a); +void blst_fp_mul_by_8(blst_fp *ret, const blst_fp *a); +void blst_fp_lshift(blst_fp *ret, const blst_fp *a, size_t count); +void blst_fp_mul(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_sqr(blst_fp *ret, const blst_fp *a); +void blst_fp_cneg(blst_fp *ret, const blst_fp *a, bool flag); +void blst_fp_eucl_inverse(blst_fp *ret, const blst_fp *a); +void blst_fp_inverse(blst_fp *ret, const blst_fp *a); +bool blst_fp_sqrt(blst_fp *ret, const blst_fp *a); + +void blst_fp_from_uint32(blst_fp *ret, const uint32_t a[12]); +void blst_uint32_from_fp(uint32_t ret[12], const blst_fp *a); +void blst_fp_from_uint64(blst_fp *ret, const uint64_t a[6]); +void blst_uint64_from_fp(uint64_t ret[6], const blst_fp *a); +void blst_fp_from_bendian(blst_fp *ret, const byte a[48]); +void blst_bendian_from_fp(byte ret[48], const blst_fp *a); +void blst_fp_from_lendian(blst_fp *ret, const byte a[48]); +void blst_lendian_from_fp(byte ret[48], const blst_fp *a); + +/* + * BLS12-381-specific Fp2 operations. 
+ */ +void blst_fp2_add(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_sub(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_mul_by_3(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_mul_by_8(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_lshift(blst_fp2 *ret, const blst_fp2 *a, size_t count); +void blst_fp2_mul(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_sqr(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_cneg(blst_fp2 *ret, const blst_fp2 *a, bool flag); +void blst_fp2_eucl_inverse(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_inverse(blst_fp2 *ret, const blst_fp2 *a); +bool blst_fp2_sqrt(blst_fp2 *ret, const blst_fp2 *a); + +/* + * BLS12-381-specific Fp12 operations. + */ +void blst_fp12_sqr(blst_fp12 *ret, const blst_fp12 *a); +void blst_fp12_cyclotomic_sqr(blst_fp12 *ret, const blst_fp12 *a); +void blst_fp12_mul(blst_fp12 *ret, const blst_fp12 *a, const blst_fp12 *b); +void blst_fp12_mul_by_xy00z0(blst_fp12 *ret, const blst_fp12 *a, + const blst_fp6 *xy00z0); +void blst_fp12_conjugate(blst_fp12 *a); +void blst_fp12_inverse(blst_fp12 *ret, const blst_fp12 *a); +/* caveat lector! |n| has to be non-zero and not more than 3! */ +void blst_fp12_frobenius_map(blst_fp12 *ret, const blst_fp12 *a, size_t n); +bool blst_fp12_is_equal(const blst_fp12 *a, const blst_fp12 *b); +bool blst_fp12_is_one(const blst_fp12 *a); +bool blst_fp12_in_group(const blst_fp12 *a); +const blst_fp12 *blst_fp12_one(void); +#endif // SWIG + +/* + * BLS12-381-specific point operations. + */ +typedef struct { blst_fp x, y, z; } blst_p1; +typedef struct { blst_fp x, y; } blst_p1_affine; + +void blst_p1_add(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); +void blst_p1_add_or_double(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); +void blst_p1_add_affine(blst_p1 *out, const blst_p1 *a, + const blst_p1_affine *b); +void blst_p1_add_or_double_affine(blst_p1 *out, const blst_p1 *a, + const blst_p1_affine *b); +void blst_p1_double(blst_p1 *out, const blst_p1 *a); +void blst_p1_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar, + size_t nbits); +void blst_p1_cneg(blst_p1 *p, bool cbit); +void blst_p1_to_affine(blst_p1_affine *out, const blst_p1 *in); +void blst_p1_from_affine(blst_p1 *out, const blst_p1_affine *in); +bool blst_p1_on_curve(const blst_p1 *p); +bool blst_p1_in_g1(const blst_p1 *p); +bool blst_p1_is_equal(const blst_p1 *a, const blst_p1 *b); +bool blst_p1_is_inf(const blst_p1 *a); +const blst_p1 *blst_p1_generator(void); + +bool blst_p1_affine_on_curve(const blst_p1_affine *p); +bool blst_p1_affine_in_g1(const blst_p1_affine *p); +bool blst_p1_affine_is_equal(const blst_p1_affine *a, const blst_p1_affine *b); +bool blst_p1_affine_is_inf(const blst_p1_affine *a); +const blst_p1_affine *blst_p1_affine_generator(void); + +typedef struct { blst_fp2 x, y, z; } blst_p2; +typedef struct { blst_fp2 x, y; } blst_p2_affine; + +void blst_p2_add(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); +void blst_p2_add_or_double(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); +void blst_p2_add_affine(blst_p2 *out, const blst_p2 *a, + const blst_p2_affine *b); +void blst_p2_add_or_double_affine(blst_p2 *out, const blst_p2 *a, + const blst_p2_affine *b); +void blst_p2_double(blst_p2 *out, const blst_p2 *a); +void blst_p2_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar, + size_t nbits); +void blst_p2_cneg(blst_p2 *p, bool cbit); +void blst_p2_to_affine(blst_p2_affine *out, const blst_p2 *in); +void blst_p2_from_affine(blst_p2 *out, const 
blst_p2_affine *in); +bool blst_p2_on_curve(const blst_p2 *p); +bool blst_p2_in_g2(const blst_p2 *p); +bool blst_p2_is_equal(const blst_p2 *a, const blst_p2 *b); +bool blst_p2_is_inf(const blst_p2 *a); +const blst_p2 *blst_p2_generator(void); + +bool blst_p2_affine_on_curve(const blst_p2_affine *p); +bool blst_p2_affine_in_g2(const blst_p2_affine *p); +bool blst_p2_affine_is_equal(const blst_p2_affine *a, const blst_p2_affine *b); +bool blst_p2_affine_is_inf(const blst_p2_affine *a); +const blst_p2_affine *blst_p2_affine_generator(void); + +/* + * Multi-scalar multiplications and other multi-point operations. + */ + +void blst_p1s_to_affine(blst_p1_affine dst[], const blst_p1 *const points[], + size_t npoints); +void blst_p1s_add(blst_p1 *ret, const blst_p1_affine *const points[], + size_t npoints); + +size_t blst_p1s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints); +void blst_p1s_mult_wbits_precompute(blst_p1_affine table[], size_t wbits, + const blst_p1_affine *const points[], + size_t npoints); +size_t blst_p1s_mult_wbits_scratch_sizeof(size_t npoints); +void blst_p1s_mult_wbits(blst_p1 *ret, const blst_p1_affine table[], + size_t wbits, size_t npoints, + const byte *const scalars[], size_t nbits, + limb_t *scratch); + +size_t blst_p1s_mult_pippenger_scratch_sizeof(size_t npoints); +void blst_p1s_mult_pippenger(blst_p1 *ret, const blst_p1_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch); +void blst_p1s_tile_pippenger(blst_p1 *ret, const blst_p1_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch, + size_t bit0, size_t window); + +void blst_p2s_to_affine(blst_p2_affine dst[], const blst_p2 *const points[], + size_t npoints); +void blst_p2s_add(blst_p2 *ret, const blst_p2_affine *const points[], + size_t npoints); + +size_t blst_p2s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints); +void blst_p2s_mult_wbits_precompute(blst_p2_affine table[], size_t wbits, + const blst_p2_affine *const points[], + size_t npoints); +size_t blst_p2s_mult_wbits_scratch_sizeof(size_t npoints); +void blst_p2s_mult_wbits(blst_p2 *ret, const blst_p2_affine table[], + size_t wbits, size_t npoints, + const byte *const scalars[], size_t nbits, + limb_t *scratch); + +size_t blst_p2s_mult_pippenger_scratch_sizeof(size_t npoints); +void blst_p2s_mult_pippenger(blst_p2 *ret, const blst_p2_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch); +void blst_p2s_tile_pippenger(blst_p2 *ret, const blst_p2_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch, + size_t bit0, size_t window); + +/* + * Hash-to-curve operations. 
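+ * blst_hash_to_g1/g2 implement the uniform ("hash_to_curve") construction of
+ * the hash-to-curve specification, while blst_encode_to_g1/g2 implement the
+ * cheaper non-uniform ("encode_to_curve") variant; a non-NULL |aug| is
+ * prepended to |msg| before hashing (message-augmentation scheme).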
+ */ +#ifndef SWIG +void blst_map_to_g1(blst_p1 *out, const blst_fp *u, const blst_fp *v DEFNULL); +void blst_map_to_g2(blst_p2 *out, const blst_fp2 *u, const blst_fp2 *v DEFNULL); +#endif + +void blst_encode_to_g1(blst_p1 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); +void blst_hash_to_g1(blst_p1 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); + +void blst_encode_to_g2(blst_p2 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); +void blst_hash_to_g2(blst_p2 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); + +/* + * Zcash-compatible serialization/deserialization. + */ +void blst_p1_serialize(byte out[96], const blst_p1 *in); +void blst_p1_compress(byte out[48], const blst_p1 *in); +void blst_p1_affine_serialize(byte out[96], const blst_p1_affine *in); +void blst_p1_affine_compress(byte out[48], const blst_p1_affine *in); +BLST_ERROR blst_p1_uncompress(blst_p1_affine *out, const byte in[48]); +BLST_ERROR blst_p1_deserialize(blst_p1_affine *out, const byte in[96]); + +void blst_p2_serialize(byte out[192], const blst_p2 *in); +void blst_p2_compress(byte out[96], const blst_p2 *in); +void blst_p2_affine_serialize(byte out[192], const blst_p2_affine *in); +void blst_p2_affine_compress(byte out[96], const blst_p2_affine *in); +BLST_ERROR blst_p2_uncompress(blst_p2_affine *out, const byte in[96]); +BLST_ERROR blst_p2_deserialize(blst_p2_affine *out, const byte in[192]); + +/* + * Specification defines two variants, 'minimal-signature-size' and + * 'minimal-pubkey-size'. To unify appearance we choose to distinguish + * them by suffix referring to the public key type, more specifically + * _pk_in_g1 corresponds to 'minimal-pubkey-size' and _pk_in_g2 - to + * 'minimal-signature-size'. It might appear a bit counterintuitive + * in sign call, but no matter how you twist it, something is bound to + * turn a little odd. + */ +/* + * Secret-key operations. + */ +void blst_keygen(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_sk_to_pk_in_g1(blst_p1 *out_pk, const blst_scalar *SK); +void blst_sign_pk_in_g1(blst_p2 *out_sig, const blst_p2 *hash, + const blst_scalar *SK); +void blst_sk_to_pk_in_g2(blst_p2 *out_pk, const blst_scalar *SK); +void blst_sign_pk_in_g2(blst_p1 *out_sig, const blst_p1 *hash, + const blst_scalar *SK); + +/* + * Pairing interface. 
+ */ +#ifndef SWIG +void blst_miller_loop(blst_fp12 *ret, const blst_p2_affine *Q, + const blst_p1_affine *P); +void blst_final_exp(blst_fp12 *ret, const blst_fp12 *f); +void blst_precompute_lines(blst_fp6 Qlines[68], const blst_p2_affine *Q); +void blst_miller_loop_lines(blst_fp12 *ret, const blst_fp6 Qlines[68], + const blst_p1_affine *P); +bool blst_fp12_finalverify(const blst_fp12 *gt1, const blst_fp12 *gt2); +#endif + +#ifdef __BLST_CGO__ +typedef limb_t blst_pairing; +#elif defined(__BLST_RUST_BINDGEN__) +typedef struct {} blst_pairing; +#else +typedef struct blst_opaque blst_pairing; +#endif + +size_t blst_pairing_sizeof(void); +void blst_pairing_init(blst_pairing *new_ctx, bool hash_or_encode, + const byte *DST DEFNULL, size_t DST_len DEFNULL); +const byte *blst_pairing_get_dst(const blst_pairing *ctx); +void blst_pairing_commit(blst_pairing *ctx); +BLST_ERROR blst_pairing_aggregate_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + const blst_p1_affine *signature, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + bool pk_grpchk, + const blst_p1_affine *signature, + bool sig_grpchk, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + const blst_p1_affine *sig, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + bool pk_grpchk, + const blst_p1_affine *sig, + bool sig_grpchk, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_aggregate_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + const blst_p2_affine *signature, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + bool pk_grpchk, + const blst_p2_affine *signature, + bool sig_grpchk, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + const blst_p2_affine *sig, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + bool pk_grpchk, + const blst_p2_affine *sig, + bool sig_grpchk, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_merge(blst_pairing *ctx, const blst_pairing *ctx1); +bool blst_pairing_finalverify(const blst_pairing *ctx, + const blst_fp12 *gtsig DEFNULL); + + +/* + * Customarily applications aggregate signatures separately. + * In which case application would have to pass NULLs for |signature| + * to blst_pairing_aggregate calls and pass aggregated signature + * collected with these calls to blst_pairing_finalverify. Inputs are + * Zcash-compatible "straight-from-wire" byte vectors, compressed or + * not. 
+ */ +BLST_ERROR blst_aggregate_in_g1(blst_p1 *out, const blst_p1 *in, + const byte *zwire); +BLST_ERROR blst_aggregate_in_g2(blst_p2 *out, const blst_p2 *in, + const byte *zwire); + +void blst_aggregated_in_g1(blst_fp12 *out, const blst_p1_affine *signature); +void blst_aggregated_in_g2(blst_fp12 *out, const blst_p2_affine *signature); + +/* + * "One-shot" CoreVerify entry points. + */ +BLST_ERROR blst_core_verify_pk_in_g1(const blst_p1_affine *pk, + const blst_p2_affine *signature, + bool hash_or_encode, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, + size_t DST_len DEFNULL, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_core_verify_pk_in_g2(const blst_p2_affine *pk, + const blst_p1_affine *signature, + bool hash_or_encode, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, + size_t DST_len DEFNULL, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); + +extern const blst_p1_affine BLS12_381_G1; +extern const blst_p1_affine BLS12_381_NEG_G1; +extern const blst_p2_affine BLS12_381_G2; +extern const blst_p2_affine BLS12_381_NEG_G2; + +#include "blst_aux.h" + +#ifdef __cplusplus +} +#endif +#endif \ No newline at end of file diff --git a/crypto/internal/blst/blst_aux.h b/crypto/internal/blst/blst_aux.h new file mode 100644 index 00000000000..d96b1f3dd3b --- /dev/null +++ b/crypto/internal/blst/blst_aux.h @@ -0,0 +1,111 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLST_AUX_H__ +#define __BLST_AUX_H__ +/* + * This file lists interfaces that might be promoted to blst.h or removed, + * depending on their proven/unproven worthiness. + */ + +void blst_fr_to(blst_fr *ret, const blst_fr *a); +void blst_fr_from(blst_fr *ret, const blst_fr *a); + +void blst_fp_to(blst_fp *ret, const blst_fp *a); +void blst_fp_from(blst_fp *ret, const blst_fp *a); + +bool blst_fp_is_square(const blst_fp *a); +bool blst_fp2_is_square(const blst_fp2 *a); + +void blst_p1_from_jacobian(blst_p1 *out, const blst_p1 *in); +void blst_p2_from_jacobian(blst_p2 *out, const blst_p2 *in); + +/* + * Below functions produce both point and deserialized outcome of + * SkToPk and Sign. However, deserialized outputs are pre-decorated + * with sign and infinity bits. This means that you have to bring the + * output into compliance prior returning to application. If you want + * compressed point value, then do [equivalent of] + * + * byte temp[96]; + * blst_sk_to_pk2_in_g1(temp, out_pk, SK); + * temp[0] |= 0x80; + * memcpy(out, temp, 48); + * + * Otherwise do + * + * blst_sk_to_pk2_in_g1(out, out_pk, SK); + * out[0] &= ~0x20; + * + * Either |out| or |out_| can be NULL. 
+ */ +void blst_sk_to_pk2_in_g1(byte out[96], blst_p1_affine *out_pk, + const blst_scalar *SK); +void blst_sign_pk2_in_g1(byte out[192], blst_p2_affine *out_sig, + const blst_p2 *hash, const blst_scalar *SK); +void blst_sk_to_pk2_in_g2(byte out[192], blst_p2_affine *out_pk, + const blst_scalar *SK); +void blst_sign_pk2_in_g2(byte out[96], blst_p1_affine *out_sig, + const blst_p1 *hash, const blst_scalar *SK); + +#ifdef __BLST_RUST_BINDGEN__ +typedef struct {} blst_uniq; +#else +typedef struct blst_opaque blst_uniq; +#endif + +size_t blst_uniq_sizeof(size_t n_nodes); +void blst_uniq_init(blst_uniq *tree); +bool blst_uniq_test(blst_uniq *tree, const byte *msg, size_t len); + +#ifdef expand_message_xmd +void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len); +#else +void blst_expand_message_xmd(byte *out, size_t out_len, + const byte *msg, size_t msg_len, + const byte *DST, size_t DST_len); +#endif + +void blst_p1_unchecked_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar, + size_t nbits); +void blst_p2_unchecked_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar, + size_t nbits); + +void blst_pairing_raw_aggregate(blst_pairing *ctx, const blst_p2_affine *q, + const blst_p1_affine *p); +blst_fp12 *blst_pairing_as_fp12(blst_pairing *ctx); +void blst_bendian_from_fp12(byte out[48*12], const blst_fp12 *a); + +void blst_keygen_v3(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_keygen_v4_5(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *salt, size_t salt_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_keygen_v5(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *salt, size_t salt_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_derive_master_eip2333(blst_scalar *out_SK, + const byte *IKM, size_t IKM_len); +void blst_derive_child_eip2333(blst_scalar *out_SK, const blst_scalar *SK, + uint32_t child_index); + +void blst_scalar_from_hexascii(blst_scalar *out, const byte *hex); +void blst_fr_from_hexascii(blst_fr *ret, const byte *hex); +void blst_fp_from_hexascii(blst_fp *ret, const byte *hex); + +size_t blst_p1_sizeof(void); +size_t blst_p1_affine_sizeof(void); +size_t blst_p2_sizeof(void); +size_t blst_p2_affine_sizeof(void); +size_t blst_fp12_sizeof(void); + +/* + * Single-shot SHA-256 hash function. + */ +void blst_sha256(byte out[32], const byte *msg, size_t msg_len); +#endif \ No newline at end of file From 98bb9ccfc810cedb6456db0304fb9813381c6301 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 8 May 2023 13:21:03 -0600 Subject: [PATCH 065/200] clarify internal blst package structure in comment --- crypto/internal/blst/blst.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/crypto/internal/blst/blst.go b/crypto/internal/blst/blst.go index 97b9047d1e3..d9a58470d12 100644 --- a/crypto/internal/blst/blst.go +++ b/crypto/internal/blst/blst.go @@ -1,5 +1,14 @@ /* - * Copied from https://github.com/supranational/blst + * This package is equivalent to the BLST Go package including all Go exported + * functions. BLST outer Go layer is used to cross-check flow-go/crypto BLS implementation. + * Note that flow-go/crypto uses BLST internal tools only to implement protocols based on BLS12-381, + * but does not use BLST outer layer and BLS implementation. 
+ * Ideally, the cross-check tests would import github.com/supranational/blst. However this is + * not possible in Go as it causes multiple duplicated C objects. Creating the internal blst + * package is a workaround to achieve the same purpose. Note that the internal package + * implicitly uses the C objects declared by flow-go/crypto. + * + * Copied from https://github.com/supranational/blst. * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. * SPDX-License-Identifier: Apache-2.0 From d729688d5383f888d7288478a073e57adf8ba969 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 8 May 2023 13:21:36 -0600 Subject: [PATCH 066/200] update bls cross-checks --- crypto/bls_crossBLST_test.go | 53 +++++++++++++++--------------------- crypto/go.mod | 1 + crypto/go.sum | 2 ++ 3 files changed, 25 insertions(+), 31 deletions(-) diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index e9f9a902d0b..f08cc52152c 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -16,24 +16,17 @@ package crypto // both libraries might have made different choices. It is nevertheless a good flag for possible bugs or deviations // from the standard as both libraries are being developed. -/*import ( +import ( "testing" + "github.com/onflow/flow-go/crypto/internal/blst" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - blst "github.com/supranational/blst/bindings/go" "pgregory.net/rapid" - - "github.com/onflow/flow-go/crypto" -)*/ - -// TODO: this file can't compile because of duplicate C and assembly symbols (the ones used -// by the current library and the same ones used by the imported package BLST). Unfortunately, -// cgo doesn't differentiate the two symbols. These tests need to be rewritten using the internal -// BLST C exports, instead of importing the Go BLST package. +) // validPrivateKeyBytesFlow generates bytes of a valid private key in Flow library -/*func validPrivateKeyBytesFlow(t *rapid.T) []byte { +func validPrivateKeyBytesFlow(t *rapid.T) []byte { seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed").([]byte) sk, err := GeneratePrivateKey(BLSBLS12381, seed) // TODO: require.NoError(t, err) seems to mess with rapid @@ -56,7 +49,7 @@ func validSignatureBytesFlow(t *rapid.T) []byte { seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed").([]byte) sk, err := GeneratePrivateKey(BLSBLS12381, seed) require.NoError(t, err) - hasher := crypto.NewExpandMsgXOFKMAC128("random_tag") + hasher := NewExpandMsgXOFKMAC128("random_tag") message := rapid.SliceOfN(rapid.Byte(), 1, 1000).Draw(t, "msg").([]byte) signature, err := sk.Sign(message, hasher) require.NoError(t, err) @@ -89,14 +82,14 @@ func validSignatureBytesBLST(t *rapid.T) []byte { // testEncodeDecodePrivateKeyCrossBLST tests encoding and decoding of private keys are consistent with BLST. // This test assumes private key serialization is identical to the one in BLST. 
func testEncodeDecodePrivateKeyCrossBLST(t *rapid.T) { - randomSlice := rapid.SliceOfN(rapid.Byte(), crypto.PrKeyLenBLSBLS12381, crypto.PrKeyLenBLSBLS12381) + randomSlice := rapid.SliceOfN(rapid.Byte(), prKeyLengthBLSBLS12381, prKeyLengthBLSBLS12381) validSliceFlow := rapid.Custom(validPrivateKeyBytesFlow) validSliceBLST := rapid.Custom(validPrivateKeyBytesBLST) // skBytes are bytes of either a valid or a random private key skBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example().([]byte) // check decoding results are consistent - skFlow, err := crypto.DecodePrivateKey(crypto.BLSBLS12381, skBytes) + skFlow, err := DecodePrivateKey(BLSBLS12381, skBytes) var skBLST blst.Scalar res := skBLST.Deserialize(skBytes) @@ -116,14 +109,14 @@ func testEncodeDecodePrivateKeyCrossBLST(t *rapid.T) { // testEncodeDecodePublicKeyCrossBLST tests encoding and decoding of public keys keys are consistent with BLST. // This test assumes public key serialization is identical to the one in BLST. func testEncodeDecodePublicKeyCrossBLST(t *rapid.T) { - randomSlice := rapid.SliceOfN(rapid.Byte(), crypto.PubKeyLenBLSBLS12381, crypto.PubKeyLenBLSBLS12381) + randomSlice := rapid.SliceOfN(rapid.Byte(), PubKeyLenBLSBLS12381, PubKeyLenBLSBLS12381) validSliceFlow := rapid.Custom(validPublicKeyBytesFlow) validSliceBLST := rapid.Custom(validPublicKeyBytesBLST) // pkBytes are bytes of either a valid or a random public key pkBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example().([]byte) // check decoding results are consistent - pkFlow, err := crypto.DecodePublicKey(crypto.BLSBLS12381, pkBytes) + pkFlow, err := DecodePublicKey(BLSBLS12381, pkBytes) var pkBLST blst.P2Affine res := pkBLST.Deserialize(pkBytes) pkValidBLST := pkBLST.KeyValidate() @@ -136,34 +129,32 @@ func testEncodeDecodePublicKeyCrossBLST(t *rapid.T) { if flowPass && blstPass { pkFlowOutBytes := pkFlow.Encode() pkBLSTOutBytes := pkBLST.Compress() - assert.Equal(t, pkFlowOutBytes, pkBLSTOutBytes) } } -// testEncodeDecodeSignatureCrossBLST tests encoding and decoding of signatures are consistent with BLST. -// This test assumes signature serialization is identical to the one in BLST. -func testEncodeDecodeSignatureCrossBLST(t *rapid.T) { - randomSlice := rapid.SliceOfN(rapid.Byte(), crypto.SignatureLenBLSBLS12381, crypto.SignatureLenBLSBLS12381) +// testEncodeDecodeG1CrossBLST tests encoding and decoding of G1 points are consistent with BLST. +// This test assumes signature serialization is identical to BLST. 
+func testEncodeDecodeG1CrossBLST(t *rapid.T) { + randomSlice := rapid.SliceOfN(rapid.Byte(), SignatureLenBLSBLS12381, SignatureLenBLSBLS12381) validSignatureFlow := rapid.Custom(validSignatureBytesFlow) validSignatureBLST := rapid.Custom(validSignatureBytesBLST) - // sigBytes are bytes of either a valid or a random signature + // sigBytes are bytes of either a valid serialization of a E1/G1 point, or random bytes sigBytes := rapid.OneOf(randomSlice, validSignatureFlow, validSignatureBLST).Example().([]byte) // check decoding results are consistent var pointFlow pointE1 - // here we test readPointE1 rather than the simple Signature type alias err := readPointE1(&pointFlow, sigBytes) - flowPass := (err == nil) && (checkMembershipG1(&pointFlow) == int(valid)) + flowPass := (err == nil) && (checkMembershipG1(&pointFlow)) var pointBLST blst.P1Affine + // res is non-nil iff point is in G1 res := pointBLST.Uncompress(sigBytes) - // flow validation has no infinity rejection for G1 blstPass := (res != nil) && pointBLST.SigValidate(false) - require.Equal(t, flowPass, blstPass, "deserialization of signature %x differs", sigBytes) + require.Equal(t, flowPass, blstPass, "deserialization of G1 %x differs", sigBytes) - // check both signatures (G1 points) are equal + // check both serializations of G1 points are equal if flowPass && blstPass { sigFlowOutBytes := make([]byte, signatureLengthBLSBLS12381) writePointG1(sigFlowOutBytes, &pointFlow) @@ -187,7 +178,7 @@ func testSignHashCrossBLST(t *rapid.T) { // generate two private keys from the same seed skBytes := rapid.Custom(validPrivateKeyBytesFlow).Example().([]byte) - skFlow, err := crypto.DecodePrivateKey(crypto.BLSBLS12381, skBytes) + skFlow, err := DecodePrivateKey(BLSBLS12381, skBytes) require.NoError(t, err) var skBLST blst.Scalar res := skBLST.Deserialize(skBytes) @@ -221,10 +212,10 @@ func testKeyGenCrossBLST(t *rapid.T) { assert.Equal(t, skFlow.Encode(), skBLST.Serialize()) } -func TestAgainstBLST(t *testing.T) { +func TestCrossBLST(t *testing.T) { rapid.Check(t, testKeyGenCrossBLST) rapid.Check(t, testEncodeDecodePrivateKeyCrossBLST) rapid.Check(t, testEncodeDecodePublicKeyCrossBLST) - rapid.Check(t, testEncodeDecodeSignatureCrossBLST) + //rapid.Check(t, testEncodeDecodeG1CrossBLST) // commented till G1 check is implemented rapid.Check(t, testSignHashCrossBLST) -}*/ +} diff --git a/crypto/go.mod b/crypto/go.mod index 57c20ef9341..d10f7a17808 100644 --- a/crypto/go.mod +++ b/crypto/go.mod @@ -8,6 +8,7 @@ require ( github.com/stretchr/testify v1.8.0 golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d gonum.org/v1/gonum v0.6.1 + pgregory.net/rapid v0.4.7 ) require ( diff --git a/crypto/go.sum b/crypto/go.sum index 181f9b302c0..820bb87a41c 100644 --- a/crypto/go.sum +++ b/crypto/go.sum @@ -52,4 +52,6 @@ gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8 gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +pgregory.net/rapid v0.4.7 h1:MTNRktPuv5FNqOO151TM9mDTa+XHcX6ypYeISDVD14g= +pgregory.net/rapid v0.4.7/go.mod h1:UYpPVyjFHzYBGHIxLFoupi8vwk6rXNzRY9OMvVxFIOU= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= From ab2fa14494785f89875f1333312f83f68405c42f Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 8 May 2023 16:09:55 -0600 Subject: [PATCH 067/200] upgrade rapid package --- 
crypto/bls_crossBLST_test.go | 29 +++++++++++++++-------------- crypto/go.mod | 2 +- crypto/go.sum | 4 ++-- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index f08cc52152c..623409cd338 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -19,15 +19,16 @@ package crypto import ( "testing" - "github.com/onflow/flow-go/crypto/internal/blst" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "pgregory.net/rapid" + + "github.com/onflow/flow-go/crypto/internal/blst" ) // validPrivateKeyBytesFlow generates bytes of a valid private key in Flow library func validPrivateKeyBytesFlow(t *rapid.T) []byte { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed").([]byte) + seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed") sk, err := GeneratePrivateKey(BLSBLS12381, seed) // TODO: require.NoError(t, err) seems to mess with rapid if err != nil { @@ -38,7 +39,7 @@ func validPrivateKeyBytesFlow(t *rapid.T) []byte { // validPublicKeyBytesFlow generates bytes of a valid public key in Flow library func validPublicKeyBytesFlow(t *rapid.T) []byte { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed").([]byte) + seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed") sk, err := GeneratePrivateKey(BLSBLS12381, seed) require.NoError(t, err) return sk.PublicKey().Encode() @@ -46,11 +47,11 @@ func validPublicKeyBytesFlow(t *rapid.T) []byte { // validSignatureBytesFlow generates bytes of a valid signature in Flow library func validSignatureBytesFlow(t *rapid.T) []byte { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed").([]byte) + seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed") sk, err := GeneratePrivateKey(BLSBLS12381, seed) require.NoError(t, err) hasher := NewExpandMsgXOFKMAC128("random_tag") - message := rapid.SliceOfN(rapid.Byte(), 1, 1000).Draw(t, "msg").([]byte) + message := rapid.SliceOfN(rapid.Byte(), 1, 1000).Draw(t, "msg") signature, err := sk.Sign(message, hasher) require.NoError(t, err) return signature @@ -59,13 +60,13 @@ func validSignatureBytesFlow(t *rapid.T) []byte { // validPrivateKeyBytesBLST generates bytes of a valid private key in BLST library func validPrivateKeyBytesBLST(t *rapid.T) []byte { randomSlice := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen) - ikm := randomSlice.Draw(t, "ikm").([]byte) + ikm := randomSlice.Draw(t, "ikm") return blst.KeyGen(ikm).Serialize() } // validPublicKeyBytesBLST generates bytes of a valid public key in BLST library func validPublicKeyBytesBLST(t *rapid.T) []byte { - ikm := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "ikm").([]byte) + ikm := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "ikm") blstS := blst.KeyGen(ikm) blstG2 := new(blst.P2Affine).From(blstS) return blstG2.Compress() @@ -73,7 +74,7 @@ func validPublicKeyBytesBLST(t *rapid.T) []byte { // validSignatureBytesBLST generates bytes of a valid signature in BLST library func validSignatureBytesBLST(t *rapid.T) []byte { - ikm := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "ikm").([]byte) + ikm := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "ikm") blstS := blst.KeyGen(ikm[:]) blstG1 := new(blst.P1Affine).From(blstS) return 
blstG1.Compress() @@ -86,7 +87,7 @@ func testEncodeDecodePrivateKeyCrossBLST(t *rapid.T) { validSliceFlow := rapid.Custom(validPrivateKeyBytesFlow) validSliceBLST := rapid.Custom(validPrivateKeyBytesBLST) // skBytes are bytes of either a valid or a random private key - skBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example().([]byte) + skBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example() // check decoding results are consistent skFlow, err := DecodePrivateKey(BLSBLS12381, skBytes) @@ -113,7 +114,7 @@ func testEncodeDecodePublicKeyCrossBLST(t *rapid.T) { validSliceFlow := rapid.Custom(validPublicKeyBytesFlow) validSliceBLST := rapid.Custom(validPublicKeyBytesBLST) // pkBytes are bytes of either a valid or a random public key - pkBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example().([]byte) + pkBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example() // check decoding results are consistent pkFlow, err := DecodePublicKey(BLSBLS12381, pkBytes) @@ -140,7 +141,7 @@ func testEncodeDecodeG1CrossBLST(t *rapid.T) { validSignatureFlow := rapid.Custom(validSignatureBytesFlow) validSignatureBLST := rapid.Custom(validSignatureBytesBLST) // sigBytes are bytes of either a valid serialization of a E1/G1 point, or random bytes - sigBytes := rapid.OneOf(randomSlice, validSignatureFlow, validSignatureBLST).Example().([]byte) + sigBytes := rapid.OneOf(randomSlice, validSignatureFlow, validSignatureBLST).Example() // check decoding results are consistent var pointFlow pointE1 @@ -176,7 +177,7 @@ func testEncodeDecodeG1CrossBLST(t *rapid.T) { // The test also assumes Flow signature serialization is identical to the one in BLST. func testSignHashCrossBLST(t *rapid.T) { // generate two private keys from the same seed - skBytes := rapid.Custom(validPrivateKeyBytesFlow).Example().([]byte) + skBytes := rapid.Custom(validPrivateKeyBytesFlow).Example() skFlow, err := DecodePrivateKey(BLSBLS12381, skBytes) require.NoError(t, err) @@ -186,7 +187,7 @@ func testSignHashCrossBLST(t *rapid.T) { // generate two signatures using both libraries blsCipher := []byte("BLS_SIG_BLS12381G1_XMD:SHA-256_SSWU_RO_NUL_") - message := rapid.SliceOfN(rapid.Byte(), 1, 1000).Example().([]byte) + message := rapid.SliceOfN(rapid.Byte(), 1, 1000).Example() var sigBLST blst.P1Affine sigBLST.Sign(&skBLST, message, blsCipher) @@ -202,7 +203,7 @@ func testSignHashCrossBLST(t *rapid.T) { } func testKeyGenCrossBLST(t *rapid.T) { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed").([]byte) + seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed") skFlow, err := GeneratePrivateKey(BLSBLS12381, seed) if err != nil { diff --git a/crypto/go.mod b/crypto/go.mod index d10f7a17808..bb3a1561b90 100644 --- a/crypto/go.mod +++ b/crypto/go.mod @@ -8,7 +8,7 @@ require ( github.com/stretchr/testify v1.8.0 golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d gonum.org/v1/gonum v0.6.1 - pgregory.net/rapid v0.4.7 + pgregory.net/rapid v0.5.7 ) require ( diff --git a/crypto/go.sum b/crypto/go.sum index 820bb87a41c..9126d59b7b2 100644 --- a/crypto/go.sum +++ b/crypto/go.sum @@ -52,6 +52,6 @@ gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8 gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod 
h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -pgregory.net/rapid v0.4.7 h1:MTNRktPuv5FNqOO151TM9mDTa+XHcX6ypYeISDVD14g= -pgregory.net/rapid v0.4.7/go.mod h1:UYpPVyjFHzYBGHIxLFoupi8vwk6rXNzRY9OMvVxFIOU= +pgregory.net/rapid v0.5.7 h1:p7/XbOgyFY1I/3Q12UTXfos70VZTcgc3WeoyiEru5cs= +pgregory.net/rapid v0.5.7/go.mod h1:PY5XlDGj0+V1FCq0o192FdRhpKHGTRIWBgqjDBTrq04= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= From 9d35d3d5f4325da523dc7e09ed4333bad8bf9477 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 8 May 2023 18:16:49 -0600 Subject: [PATCH 068/200] fix linter false positives --- crypto/internal/blst/blst.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crypto/internal/blst/blst.go b/crypto/internal/blst/blst.go index d9a58470d12..037e40d98a3 100644 --- a/crypto/internal/blst/blst.go +++ b/crypto/internal/blst/blst.go @@ -8,6 +8,8 @@ * package is a workaround to achieve the same purpose. Note that the internal package * implicitly uses the C objects declared by flow-go/crypto. * + * Note: linter staticcheck was added in two spots to avoid linter false positives. + * * Copied from https://github.com/supranational/blst. * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. @@ -587,6 +589,7 @@ func coreAggregateVerifyPkInG1(sigFn sigGetterP2, sigGroupcheck bool, // main thread has completed its miller loop before // proceeding. mutex.Lock() + //nolint:staticcheck mutex.Unlock() } @@ -1205,6 +1208,7 @@ func coreAggregateVerifyPkInG2(sigFn sigGetterP1, sigGroupcheck bool, // main thread has completed its miller loop before // proceeding. mutex.Lock() + //nolint:staticcheck mutex.Unlock() } From 6031d71edf7b0e0071dddef0bf23e57c51d48b08 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 8 May 2023 18:17:05 -0600 Subject: [PATCH 069/200] go mod tidy --- go.mod | 2 +- go.sum | 4 ++-- insecure/go.sum | 2 +- integration/go.sum | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/go.mod b/go.mod index 64ea4ffb5ae..d8bade39895 100644 --- a/go.mod +++ b/go.mod @@ -95,7 +95,7 @@ require ( google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.2.0 google.golang.org/protobuf v1.30.0 gotest.tools v2.2.0+incompatible - pgregory.net/rapid v0.4.7 + pgregory.net/rapid v0.5.7 ) require ( diff --git a/go.sum b/go.sum index 630652a0d59..b7e715a259f 100644 --- a/go.sum +++ b/go.sum @@ -2255,8 +2255,8 @@ lukechampine.com/blake3 v1.1.7 h1:GgRMhmdsuK8+ii6UZFDL8Nb+VyMwadAgcJyfYHxG6n0= lukechampine.com/blake3 v1.1.7/go.mod h1:tkKEOtDkNtklkXtLNEOGNq5tcV90tJiA1vAA12R78LA= nhooyr.io/websocket v1.8.6 h1:s+C3xAMLwGmlI31Nyn/eAehUlZPwfYZu2JXM621Q5/k= nhooyr.io/websocket v1.8.6/go.mod h1:B70DZP8IakI65RVQ51MsWP/8jndNma26DVA/nFSCgW0= -pgregory.net/rapid v0.4.7 h1:MTNRktPuv5FNqOO151TM9mDTa+XHcX6ypYeISDVD14g= -pgregory.net/rapid v0.4.7/go.mod h1:UYpPVyjFHzYBGHIxLFoupi8vwk6rXNzRY9OMvVxFIOU= +pgregory.net/rapid v0.5.7 h1:p7/XbOgyFY1I/3Q12UTXfos70VZTcgc3WeoyiEru5cs= +pgregory.net/rapid v0.5.7/go.mod h1:PY5XlDGj0+V1FCq0o192FdRhpKHGTRIWBgqjDBTrq04= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= diff --git a/insecure/go.sum b/insecure/go.sum index 346f2c72189..5f842bc20b9 100644 --- a/insecure/go.sum +++ b/insecure/go.sum @@ -2100,7 +2100,7 @@ lukechampine.com/blake3 v1.1.7 h1:GgRMhmdsuK8+ii6UZFDL8Nb+VyMwadAgcJyfYHxG6n0= 
lukechampine.com/blake3 v1.1.7/go.mod h1:tkKEOtDkNtklkXtLNEOGNq5tcV90tJiA1vAA12R78LA= nhooyr.io/websocket v1.8.6 h1:s+C3xAMLwGmlI31Nyn/eAehUlZPwfYZu2JXM621Q5/k= nhooyr.io/websocket v1.8.6/go.mod h1:B70DZP8IakI65RVQ51MsWP/8jndNma26DVA/nFSCgW0= -pgregory.net/rapid v0.4.7 h1:MTNRktPuv5FNqOO151TM9mDTa+XHcX6ypYeISDVD14g= +pgregory.net/rapid v0.5.7 h1:p7/XbOgyFY1I/3Q12UTXfos70VZTcgc3WeoyiEru5cs= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= diff --git a/integration/go.sum b/integration/go.sum index a31e392c4c7..64e4d983caf 100644 --- a/integration/go.sum +++ b/integration/go.sum @@ -2338,7 +2338,7 @@ modernc.org/memory v1.5.0 h1:N+/8c5rE6EqugZwHii4IFsaJ7MUhoWX07J5tC/iI5Ds= modernc.org/memory v1.5.0/go.mod h1:PkUhL0Mugw21sHPeskwZW4D6VscE/GQJOnIpCnW6pSU= modernc.org/sqlite v1.21.1 h1:GyDFqNnESLOhwwDRaHGdp2jKLDzpyT/rNLglX3ZkMSU= modernc.org/sqlite v1.21.1/go.mod h1:XwQ0wZPIh1iKb5mkvCJ3szzbhk+tykC8ZWqTRTgYRwI= -pgregory.net/rapid v0.4.7 h1:MTNRktPuv5FNqOO151TM9mDTa+XHcX6ypYeISDVD14g= +pgregory.net/rapid v0.5.7 h1:p7/XbOgyFY1I/3Q12UTXfos70VZTcgc3WeoyiEru5cs= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= From 6b5fd8fffe0d6e8ed8a4b5ca3e305e33a438e716 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 8 May 2023 18:49:33 -0600 Subject: [PATCH 070/200] Revert "upgrade rapid package" This reverts commit ab2fa14494785f89875f1333312f83f68405c42f. --- crypto/bls_crossBLST_test.go | 29 ++++++++++++++--------------- crypto/go.mod | 2 +- crypto/go.sum | 4 ++-- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index 623409cd338..f08cc52152c 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -19,16 +19,15 @@ package crypto import ( "testing" + "github.com/onflow/flow-go/crypto/internal/blst" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "pgregory.net/rapid" - - "github.com/onflow/flow-go/crypto/internal/blst" ) // validPrivateKeyBytesFlow generates bytes of a valid private key in Flow library func validPrivateKeyBytesFlow(t *rapid.T) []byte { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed") + seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed").([]byte) sk, err := GeneratePrivateKey(BLSBLS12381, seed) // TODO: require.NoError(t, err) seems to mess with rapid if err != nil { @@ -39,7 +38,7 @@ func validPrivateKeyBytesFlow(t *rapid.T) []byte { // validPublicKeyBytesFlow generates bytes of a valid public key in Flow library func validPublicKeyBytesFlow(t *rapid.T) []byte { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed") + seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed").([]byte) sk, err := GeneratePrivateKey(BLSBLS12381, seed) require.NoError(t, err) return sk.PublicKey().Encode() @@ -47,11 +46,11 @@ func validPublicKeyBytesFlow(t *rapid.T) []byte { // validSignatureBytesFlow generates bytes of a valid signature in Flow library func validSignatureBytesFlow(t *rapid.T) []byte { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, 
KeyGenSeedMaxLen).Draw(t, "seed") + seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed").([]byte) sk, err := GeneratePrivateKey(BLSBLS12381, seed) require.NoError(t, err) hasher := NewExpandMsgXOFKMAC128("random_tag") - message := rapid.SliceOfN(rapid.Byte(), 1, 1000).Draw(t, "msg") + message := rapid.SliceOfN(rapid.Byte(), 1, 1000).Draw(t, "msg").([]byte) signature, err := sk.Sign(message, hasher) require.NoError(t, err) return signature @@ -60,13 +59,13 @@ func validSignatureBytesFlow(t *rapid.T) []byte { // validPrivateKeyBytesBLST generates bytes of a valid private key in BLST library func validPrivateKeyBytesBLST(t *rapid.T) []byte { randomSlice := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen) - ikm := randomSlice.Draw(t, "ikm") + ikm := randomSlice.Draw(t, "ikm").([]byte) return blst.KeyGen(ikm).Serialize() } // validPublicKeyBytesBLST generates bytes of a valid public key in BLST library func validPublicKeyBytesBLST(t *rapid.T) []byte { - ikm := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "ikm") + ikm := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "ikm").([]byte) blstS := blst.KeyGen(ikm) blstG2 := new(blst.P2Affine).From(blstS) return blstG2.Compress() @@ -74,7 +73,7 @@ func validPublicKeyBytesBLST(t *rapid.T) []byte { // validSignatureBytesBLST generates bytes of a valid signature in BLST library func validSignatureBytesBLST(t *rapid.T) []byte { - ikm := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "ikm") + ikm := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "ikm").([]byte) blstS := blst.KeyGen(ikm[:]) blstG1 := new(blst.P1Affine).From(blstS) return blstG1.Compress() @@ -87,7 +86,7 @@ func testEncodeDecodePrivateKeyCrossBLST(t *rapid.T) { validSliceFlow := rapid.Custom(validPrivateKeyBytesFlow) validSliceBLST := rapid.Custom(validPrivateKeyBytesBLST) // skBytes are bytes of either a valid or a random private key - skBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example() + skBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example().([]byte) // check decoding results are consistent skFlow, err := DecodePrivateKey(BLSBLS12381, skBytes) @@ -114,7 +113,7 @@ func testEncodeDecodePublicKeyCrossBLST(t *rapid.T) { validSliceFlow := rapid.Custom(validPublicKeyBytesFlow) validSliceBLST := rapid.Custom(validPublicKeyBytesBLST) // pkBytes are bytes of either a valid or a random public key - pkBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example() + pkBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example().([]byte) // check decoding results are consistent pkFlow, err := DecodePublicKey(BLSBLS12381, pkBytes) @@ -141,7 +140,7 @@ func testEncodeDecodeG1CrossBLST(t *rapid.T) { validSignatureFlow := rapid.Custom(validSignatureBytesFlow) validSignatureBLST := rapid.Custom(validSignatureBytesBLST) // sigBytes are bytes of either a valid serialization of a E1/G1 point, or random bytes - sigBytes := rapid.OneOf(randomSlice, validSignatureFlow, validSignatureBLST).Example() + sigBytes := rapid.OneOf(randomSlice, validSignatureFlow, validSignatureBLST).Example().([]byte) // check decoding results are consistent var pointFlow pointE1 @@ -177,7 +176,7 @@ func testEncodeDecodeG1CrossBLST(t *rapid.T) { // The test also assumes Flow signature serialization is identical to the one in BLST. 
func testSignHashCrossBLST(t *rapid.T) { // generate two private keys from the same seed - skBytes := rapid.Custom(validPrivateKeyBytesFlow).Example() + skBytes := rapid.Custom(validPrivateKeyBytesFlow).Example().([]byte) skFlow, err := DecodePrivateKey(BLSBLS12381, skBytes) require.NoError(t, err) @@ -187,7 +186,7 @@ func testSignHashCrossBLST(t *rapid.T) { // generate two signatures using both libraries blsCipher := []byte("BLS_SIG_BLS12381G1_XMD:SHA-256_SSWU_RO_NUL_") - message := rapid.SliceOfN(rapid.Byte(), 1, 1000).Example() + message := rapid.SliceOfN(rapid.Byte(), 1, 1000).Example().([]byte) var sigBLST blst.P1Affine sigBLST.Sign(&skBLST, message, blsCipher) @@ -203,7 +202,7 @@ func testSignHashCrossBLST(t *rapid.T) { } func testKeyGenCrossBLST(t *rapid.T) { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed") + seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed").([]byte) skFlow, err := GeneratePrivateKey(BLSBLS12381, seed) if err != nil { diff --git a/crypto/go.mod b/crypto/go.mod index bb3a1561b90..d10f7a17808 100644 --- a/crypto/go.mod +++ b/crypto/go.mod @@ -8,7 +8,7 @@ require ( github.com/stretchr/testify v1.8.0 golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d gonum.org/v1/gonum v0.6.1 - pgregory.net/rapid v0.5.7 + pgregory.net/rapid v0.4.7 ) require ( diff --git a/crypto/go.sum b/crypto/go.sum index 9126d59b7b2..820bb87a41c 100644 --- a/crypto/go.sum +++ b/crypto/go.sum @@ -52,6 +52,6 @@ gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8 gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -pgregory.net/rapid v0.5.7 h1:p7/XbOgyFY1I/3Q12UTXfos70VZTcgc3WeoyiEru5cs= -pgregory.net/rapid v0.5.7/go.mod h1:PY5XlDGj0+V1FCq0o192FdRhpKHGTRIWBgqjDBTrq04= +pgregory.net/rapid v0.4.7 h1:MTNRktPuv5FNqOO151TM9mDTa+XHcX6ypYeISDVD14g= +pgregory.net/rapid v0.4.7/go.mod h1:UYpPVyjFHzYBGHIxLFoupi8vwk6rXNzRY9OMvVxFIOU= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= From e35860fedf113574fd043c45611225fb9dfd9ba0 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 8 May 2023 18:50:01 -0600 Subject: [PATCH 071/200] Revert "go mod tidy" This reverts commit 6031d71edf7b0e0071dddef0bf23e57c51d48b08. 
--- go.mod | 2 +- go.sum | 4 ++-- insecure/go.sum | 2 +- integration/go.sum | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/go.mod b/go.mod index d8bade39895..64ea4ffb5ae 100644 --- a/go.mod +++ b/go.mod @@ -95,7 +95,7 @@ require ( google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.2.0 google.golang.org/protobuf v1.30.0 gotest.tools v2.2.0+incompatible - pgregory.net/rapid v0.5.7 + pgregory.net/rapid v0.4.7 ) require ( diff --git a/go.sum b/go.sum index b7e715a259f..630652a0d59 100644 --- a/go.sum +++ b/go.sum @@ -2255,8 +2255,8 @@ lukechampine.com/blake3 v1.1.7 h1:GgRMhmdsuK8+ii6UZFDL8Nb+VyMwadAgcJyfYHxG6n0= lukechampine.com/blake3 v1.1.7/go.mod h1:tkKEOtDkNtklkXtLNEOGNq5tcV90tJiA1vAA12R78LA= nhooyr.io/websocket v1.8.6 h1:s+C3xAMLwGmlI31Nyn/eAehUlZPwfYZu2JXM621Q5/k= nhooyr.io/websocket v1.8.6/go.mod h1:B70DZP8IakI65RVQ51MsWP/8jndNma26DVA/nFSCgW0= -pgregory.net/rapid v0.5.7 h1:p7/XbOgyFY1I/3Q12UTXfos70VZTcgc3WeoyiEru5cs= -pgregory.net/rapid v0.5.7/go.mod h1:PY5XlDGj0+V1FCq0o192FdRhpKHGTRIWBgqjDBTrq04= +pgregory.net/rapid v0.4.7 h1:MTNRktPuv5FNqOO151TM9mDTa+XHcX6ypYeISDVD14g= +pgregory.net/rapid v0.4.7/go.mod h1:UYpPVyjFHzYBGHIxLFoupi8vwk6rXNzRY9OMvVxFIOU= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= diff --git a/insecure/go.sum b/insecure/go.sum index 5f842bc20b9..346f2c72189 100644 --- a/insecure/go.sum +++ b/insecure/go.sum @@ -2100,7 +2100,7 @@ lukechampine.com/blake3 v1.1.7 h1:GgRMhmdsuK8+ii6UZFDL8Nb+VyMwadAgcJyfYHxG6n0= lukechampine.com/blake3 v1.1.7/go.mod h1:tkKEOtDkNtklkXtLNEOGNq5tcV90tJiA1vAA12R78LA= nhooyr.io/websocket v1.8.6 h1:s+C3xAMLwGmlI31Nyn/eAehUlZPwfYZu2JXM621Q5/k= nhooyr.io/websocket v1.8.6/go.mod h1:B70DZP8IakI65RVQ51MsWP/8jndNma26DVA/nFSCgW0= -pgregory.net/rapid v0.5.7 h1:p7/XbOgyFY1I/3Q12UTXfos70VZTcgc3WeoyiEru5cs= +pgregory.net/rapid v0.4.7 h1:MTNRktPuv5FNqOO151TM9mDTa+XHcX6ypYeISDVD14g= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= diff --git a/integration/go.sum b/integration/go.sum index 64e4d983caf..a31e392c4c7 100644 --- a/integration/go.sum +++ b/integration/go.sum @@ -2338,7 +2338,7 @@ modernc.org/memory v1.5.0 h1:N+/8c5rE6EqugZwHii4IFsaJ7MUhoWX07J5tC/iI5Ds= modernc.org/memory v1.5.0/go.mod h1:PkUhL0Mugw21sHPeskwZW4D6VscE/GQJOnIpCnW6pSU= modernc.org/sqlite v1.21.1 h1:GyDFqNnESLOhwwDRaHGdp2jKLDzpyT/rNLglX3ZkMSU= modernc.org/sqlite v1.21.1/go.mod h1:XwQ0wZPIh1iKb5mkvCJ3szzbhk+tykC8ZWqTRTgYRwI= -pgregory.net/rapid v0.5.7 h1:p7/XbOgyFY1I/3Q12UTXfos70VZTcgc3WeoyiEru5cs= +pgregory.net/rapid v0.4.7 h1:MTNRktPuv5FNqOO151TM9mDTa+XHcX6ypYeISDVD14g= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= From 22b1ef86061ca290ffc11abcc5522809cce2c665 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 8 May 2023 18:51:44 -0600 Subject: [PATCH 072/200] fix linter error --- crypto/bls_crossBLST_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index f08cc52152c..e67c3c3bc33 100644 --- a/crypto/bls_crossBLST_test.go +++ 
b/crypto/bls_crossBLST_test.go @@ -19,10 +19,11 @@ package crypto import ( "testing" - "github.com/onflow/flow-go/crypto/internal/blst" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "pgregory.net/rapid" + + "github.com/onflow/flow-go/crypto/internal/blst" ) // validPrivateKeyBytesFlow generates bytes of a valid private key in Flow library From 0290a985a92de6f147d5289546faa3b0a8ec6f04 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 8 May 2023 21:33:50 -0600 Subject: [PATCH 073/200] clean up of multiple pairing --- crypto/bls12381_utils.c | 9 --------- crypto/bls_core.c | 11 +---------- crypto/bls_include.h | 4 ---- 3 files changed, 1 insertion(+), 23 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index ccec6c78d17..cd1ebd543d1 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1073,7 +1073,6 @@ int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* ep2_copy(elemsG2[0], tmp); free(tmp); -#if DOUBLE_PAIRING // elemsG2[0] = -pk2 ep2_neg(elemsG2[0], elemsG2[0]); @@ -1085,14 +1084,6 @@ int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* // compare the result to 1 int res = fp12_cmp_dig(pair, 1); -#elif SINGLE_PAIRING - fp12_t pair1, pair2; - fp12_new(&pair1); fp12_new(&pair2); - pp_map_oatep_k12(pair1, elemsG1[0], elemsG2[0]); - pp_map_oatep_k12(pair2, elemsG1[1], elemsG2[1]); - - int res = fp12_cmp(pair1, pair2); -#endif fp12_free(&one); ep_free(elemsG1[0]); ep_free(elemsG1[1]); diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 58a7287578f..815f1893375 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -84,7 +84,6 @@ static int bls_verify_ep(const E2* pk, const ep_t s, const byte* data, const int int ret = UNDEFINED; -#if DOUBLE_PAIRING // elemsG2[0] = -g2 ep2_neg(elemsG2[0], core_get()->ep2_g); // could be hardcoded @@ -95,15 +94,7 @@ static int bls_verify_ep(const E2* pk, const ep_t s, const byte* data, const int // compare the result to 1 int res = fp12_cmp_dig(pair, 1); - -#elif SINGLE_PAIRING - fp12_t pair1, pair2; - fp12_new(&pair1); fp12_new(&pair2); - pp_map_oatep_k12(pair1, elemsG1[0], core_get()->ep2_g); - pp_map_oatep_k12(pair2, elemsG1[1], elemsG2[1]); - - int res = fp12_cmp(pair1, pair2); -#endif + if (core_get()->code == RLC_OK) { if (res == RLC_EQ) { ret = VALID; diff --git a/crypto/bls_include.h b/crypto/bls_include.h index d0f9120beb2..21a8d9fda59 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -16,10 +16,6 @@ #define SK_BITS (Fr_BITS) #define SK_LEN BITS_TO_BYTES(SK_BITS) -// Simultaneous Pairing in verification -#define DOUBLE_PAIRING 1 -#define SINGLE_PAIRING (DOUBLE_PAIRING^1) - // algorithm choice for hashing to G1 // both methods are similar implementations of the same optimized SSWU // but offer different timings. 
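
A note on the verification path kept by the cleanup above: rather than computing the two pairings e(sig, g2) and e(H(m), pk) separately and comparing them, the retained code checks the product e(sig, -g2) * e(H(m), pk) == 1, so both Miller loops share a single final exponentiation instead of needing one each. The next patch generalizes the same idea to an n-fold product (multi_pairing). The sketch below restates the two-pairing case using only the public blst declarations listed earlier in this series (blst_miller_loop, blst_fp12_mul, blst_final_exp, blst_fp12_is_one, BLS12_381_NEG_G2); the helper name verify_sig_sketch and its affine inputs are illustrative assumptions, not code from these patches, and group-membership checks and error handling are omitted.

/*
 * Minimal sketch, assuming signatures in G1 and public keys in G2
 * (the variant used by flow-go/crypto). Not part of the patches.
 */
#include <stdbool.h>
#include "blst.h"

static bool verify_sig_sketch(const blst_p1_affine *sig,  /* signature, a point of G1 */
                              const blst_p1_affine *hash, /* H(m) mapped to G1 */
                              const blst_p2_affine *pk)   /* public key, a point of G2 */
{
    blst_fp12 acc, tmp;

    blst_miller_loop(&acc, &BLS12_381_NEG_G2, sig); /* Miller loop of e(sig, -g2) */
    blst_miller_loop(&tmp, pk, hash);               /* Miller loop of e(H(m), pk) */
    blst_fp12_mul(&acc, &acc, &tmp);                /* multiply the loop outputs in Fp12 */
    blst_final_exp(&acc, &acc);                     /* one shared final exponentiation */

    return blst_fp12_is_one(&acc);                  /* 1 iff e(sig, g2) == e(H(m), pk) */
}
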
From 7f86c9492054a0411d39524f2fba6652438d4c8c Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 11 May 2023 01:06:28 -0600 Subject: [PATCH 074/200] Fp12 and multi-pairing computation --- crypto/bls12381_utils.c | 112 +++++++++++++++++++++++++++++++++++++++- crypto/bls12381_utils.h | 5 ++ crypto/blst_include.h | 4 ++ 3 files changed, 119 insertions(+), 2 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index cd1ebd543d1..8d724b28a7c 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -158,6 +158,9 @@ void Fr_set_limb(Fr* a, const limb_t l){ } void Fr_copy(Fr* res, const Fr* a) { + if ((uptr_t)a==(uptr_t)res) { + return; + } vec_copy((byte*)res, (byte*)a, sizeof(Fr)); } @@ -386,6 +389,9 @@ void Fp_set_limb(Fp* a, const limb_t l){ } void Fp_copy(Fp* res, const Fp* a) { + if ((uptr_t)a==(uptr_t)res) { + return; + } vec_copy((byte*)res, (byte*)a, sizeof(Fp)); } @@ -578,6 +584,31 @@ void Fp2_write_bytes(byte *bin, const Fp2* a) { // ------------------- G1 utilities +// res = p +void E1_copy(E1* res, const E1* p) { + if ((uptr_t)p==(uptr_t)res) { + return; + } + vec_copy(res, p, sizeof(E1)); +} + +// check if `p` is infinity +bool_t E1_is_infty(const E1* p) { + // BLST infinity points are defined by Z=0 + return vec_is_zero(p->z, sizeof(p->z)); +} + +// converts an E1 point from Jacobian into affine coordinates (z=1) +void E1_to_affine(E1* res, const E1* p) { + // optimization in case coordinates are already affine + if (vec_is_equal(p->z, BLS12_381_pR, sizeof(p->z))) { + E1_copy(res, p); + return; + } + // convert from Jacobian + POINTonE1_from_Jacobian((POINTonE1*)res, (const POINTonE1*)p); +} + // ep_read_bin_compact imports a point from a buffer in a compressed or uncompressed form. // len is the size of the input buffer. // @@ -967,13 +998,16 @@ bool_t E2_is_equal(const E2* p1, const E2* p2) { // res = p void E2_copy(E2* res, const E2* p) { + if ((uptr_t)p==(uptr_t)res) { + return; + } vec_copy(res, p, sizeof(E2)); } // converts an E2 point from Jacobian into affine coordinates (z=1) void E2_to_affine(E2* res, const E2* p) { - // minor optimization in case coordinates are already affine - if (vec_is_equal(p->z, BLS12_381_Rx.p2, Fp2_BYTES)) { + // optimization in case coordinates are already affine + if (vec_is_equal(p->z, BLS12_381_Rx.p2, sizeof(p->z))) { E2_copy(res, p); return; } @@ -1296,6 +1330,80 @@ BLST_ERROR map_bytes_to_G2complement(E2* p, const uint8_t* bytes, int len) { return BLST_SUCCESS; } +// ------------------- Pairing utilities + +bool_t Fp12_is_one(Fp12 *a) { + return vec_is_equal(a[0][0], BLS12_381_Rx.p2, sizeof(a[0][0])) & + vec_is_zero(a[0][1], sizeof(a) - sizeof(a[0][0])); +} + +static void Fp12_set_one(Fp12 *a) { + vec_copy(a[0][0], BLS12_381_Rx.p2, sizeof(a[0][0])); + vec_zero(a[0][1], sizeof(a) - sizeof(a[0][0])); +} + +// computes e(p[0], q[0]) * ... * e(q[len-1], q[len-1]) +// by optimizing a common final exponentiation for all pairings. +// result is stored in `res`. +// It assumes `p` and `q` are correctly initialized and all +// p[i] and q[i] are respectively on G1 and G2 (it does not +// check their memberships). +void multi_pairing(Fp12 *res, const E1 *p, const E2 *q, const int len) { + // N_MAX is defined within BLST. It should represent a good tradeoff of the max number + // of miller loops to be batched in one call to `miller_loop_n`. 
+ E1 p_[N_MAX]; + E2 q_[N_MAX]; + int n = 0; // the number of couples (p,q) held p_ and q_ + int init_flag = 0; + + // easier access pointers + vec384fp6* res_vec = (vec384fp6*)res; + POINTonE1_affine* p_POINT = (POINTonE1_affine*)p_; + POINTonE2_affine* q_POINT = (POINTonE2_affine*)q_; + + + for (int i=0; i 0) { + if (!init_flag) { + miller_loop_n(res_vec, q_POINT, p_POINT, n); + init_flag = 1; + } else { + vec384fp12 tmp; + miller_loop_n(tmp, q_POINT, p_POINT, n); + mul_fp12(res_vec, res_vec, tmp); + } + } + + // check if no miller loop was computed + if (!init_flag) { + Fp12_set_one(res); + } + + final_exp(res_vec, res_vec); +} + + + // This is a testing function. // It wraps a call to a Relic macro since cgo can't call macros. void xmd_sha256(byte *hash, int len_hash, byte *msg, int len_msg, byte *dst, int len_dst){ diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 5244e8cd16a..6df825b3f57 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -131,6 +131,7 @@ void ep_mult(ep_t, const ep_t, const Fr*); void ep_sum_vector(ep_t, ep_st*, const int); int ep_sum_vector_byte(byte*, const byte*, const int); int E1_in_G1(const ep_t); +bool_t E1_is_infty(const E1*); int G1_simple_subgroup_check(const ep_t); void map_bytes_to_G1(E1*, const uint8_t*, int); void map_bytes_to_G1complement(E1*, const uint8_t*, int); @@ -157,6 +158,10 @@ bool_t E2_in_G2(const E2*); void map_bytes_to_G2(E2*, const uint8_t*, int); BLST_ERROR map_bytes_to_G2complement(E2*, const uint8_t*, int); +// pairing and Fp12 +bool_t Fp12_is_one(Fp12*); +void multi_pairing(Fp12*, const E1*, const E2*, const int); + // Utility functions ctx_t* relic_init_BLS12_381(); prec_st* init_precomputed_data_BLS12_381(); diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 64b8e4562b8..1f7b2484a3c 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -104,4 +104,8 @@ typedef vec384x Fp2; // `E2` is also used to represent all subgroup G_2 elements. typedef struct {Fp2 x,y,z;} E2; +// Fp12 is the codomain of the pairing function `e`, specifically the subgroup +// G_T of Fp12. 
+// Fp12 represents G_T elements and is equivalent to `vec384fp12` (used internally by BLST) +typedef vec384fp12 Fp12; #endif From 7b7e484b83e423df6879b75252a0e0892152b7c4 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 11 May 2023 18:32:49 -0600 Subject: [PATCH 075/200] use E1 blst type in Go --- crypto/bls.go | 2 +- crypto/bls12381_utils.go | 37 ++++++++++++++----------------------- 2 files changed, 15 insertions(+), 24 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 56225332562..804c34b619c 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -550,7 +550,7 @@ func (a *blsBLS12381Algo) init() error { func mapToG1(data []byte) *pointE1 { l := len(data) var h pointE1 - C.map_to_G1((*C.ep_st)(&h), (*C.uchar)(&data[0]), (C.int)(l)) + C.map_to_G1((*C.E1)(&h), (*C.uchar)(&data[0]), (C.int)(l)) return &h } diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 0f685494d4f..df1df3e4f2b 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -41,17 +41,15 @@ import ( // Go wrappers around BLST C types // Go wrappers around Relic C types -type pointE1 C.ep_st +type pointE1 C.E1 type pointE2 C.E2 type scalar C.Fr -// BLS12-381 related lengths -var frBytesLen = int(C.get_Fr_BYTES()) - // TODO: For now scalars are represented as field elements Fr since all scalars // are less than r - check if distinguishing two types in necessary -//type pointG1_blst C.E1 -//type pointG2_blst C.E2 + +// BLS12-381 related lengths +var frBytesLen = int(C.get_Fr_BYTES()) // context required for the BLS set-up type ctx struct { @@ -94,24 +92,17 @@ func (ct *ctx) initContext() error { return nil } -// Exponentiation in G1 (scalar point multiplication) +// Scalar multiplication of a generic point `p` in G1 func (p *pointE1) scalarMultG1(res *pointE1, expo *scalar) { - C.ep_mult((*C.ep_st)(res), (*C.ep_st)(p), (*C.Fr)(expo)) + C.E1_mult((*C.E1)(res), (*C.E1)(p), (*C.Fr)(expo)) } -// This function is for TEST only -// Exponentiation of g1 in G1 +// Scalar multiplication of generator g1 in G1 func generatorScalarMultG1(res *pointE1, expo *scalar) { - C.ep_mult_gen_bench((*C.ep_st)(res), (*C.Fr)(expo)) -} - -// This function is for TEST only -// Generic Exponentiation G1 -func genericScalarMultG1(res *pointE1, expo *scalar) { - C.ep_mult_generic_bench((*C.ep_st)(res), (*C.Fr)(expo)) + C.G1_mult_gen((*C.E1)(res), (*C.Fr)(expo)) } -// Exponentiation of g2 in G2 +// Scalar multiplication of generator g2 in G2 func generatorScalarMultG2(res *pointE2, expo *scalar) { C.G2_mult_gen((*C.E2)(res), (*C.Fr)(expo)) } @@ -187,7 +178,7 @@ func writePointG2(dest []byte, a *pointE2) { // follow the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves func writePointG1(dest []byte, a *pointE1) { C.ep_write_bin_compact((*C.uchar)(&dest[0]), - (*C.ep_st)(a), + (*C.E1)(a), (C.int)(signatureLengthBLSBLS12381), ) } @@ -240,7 +231,7 @@ func readPointE2(a *pointE2, src []byte) error { // follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves. // No G1 membership check is performed. func readPointE1(a *pointE1, src []byte) error { - switch C.ep_read_bin_compact((*C.ep_st)(a), + switch C.ep_read_bin_compact((*C.E1)(a), (*C.uchar)(&src[0]), (C.int)(len(src))) { case valid: @@ -269,13 +260,13 @@ func checkMembershipG2(pt *pointE2) bool { // randPointG1 wraps a call to C since cgo can't be used in go test files. // It generates a random point in G1 and stores it in input point. 
func randPointG1(pt *pointE1) { - C.ep_rand_G1((*C.ep_st)(pt)) + C.ep_rand_G1((*C.E1)(pt)) } // randPointG1Complement wraps a call to C since cgo can't be used in go test files. // It generates a random point in E1\G1 and stores it in input point. func randPointG1Complement(pt *pointE1) { - C.ep_rand_G1complement((*C.ep_st)(pt)) + C.ep_rand_G1complement((*C.E1)(pt)) } */ @@ -311,7 +302,7 @@ func hashToG1Bytes(data, dst []byte) []byte { // map the hash to G1 var point pointE1 - C.map_to_G1((*C.ep_st)(&point), (*C.uchar)(&hash[0]), (C.int)(len(hash))) + C.map_to_G1((*C.E1)(&point), (*C.uchar)(&hash[0]), (C.int)(len(hash))) // serialize the point pointBytes := make([]byte, signatureLengthBLSBLS12381) From 7cafb2b0723462ba9034e2e9324e04e0da8aee6d Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 11 May 2023 18:33:36 -0600 Subject: [PATCH 076/200] use POINTonEx_mult_glv instead of blst_sign and remove Relic's E1 mult --- crypto/bls12381_utils.c | 39 ++++++++++++++++------------------- crypto/bls12381_utils.h | 10 ++++----- crypto/bls12381_utils_test.go | 22 +++++++------------- 3 files changed, 31 insertions(+), 40 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index ccec6c78d17..f188cb0d33e 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -701,26 +701,20 @@ void ep_write_bin_compact(byte *bin, const ep_t a, const int len) { bin[0] |= (G1_SERIALIZATION << 7); } -// Exponentiation of a generic point p in G1 -void ep_mult(ep_t res, const ep_t p, const Fr *expo) { - bn_st* tmp_expo = Fr_blst_to_relic(expo); - // Using window NAF of size 2 - ep_mul_lwnaf(res, p, tmp_expo); - free(tmp_expo); -} - -// Exponentiation of generator g1 in G1 -// These two function are here for bench purposes only -void ep_mult_gen_bench(ep_t res, const Fr* expo) { - bn_st* tmp_expo = Fr_blst_to_relic(expo); - // Using precomputed table of size 4 - ep_mul_gen(res, tmp_expo); - free(tmp_expo); +// Exponentiation of a generic point `a` in E1, res = expo.a +void E1_mult(E1* res, const E1* p, const Fr* expo) { + pow256 tmp; + pow256_from_Fr(tmp, expo); + POINTonE1_mult_glv((POINTonE1*)res, (POINTonE1*)p, tmp); + vec_zero(&tmp, sizeof(tmp)); } -void ep_mult_generic_bench(ep_t res, const Fr* expo) { - // generic point multiplication - ep_mult(res, &core_get()->ep_g, expo); +// Exponentiation of generator g1 of G1, res = expo.g1 +void G1_mult_gen(E1* res, const Fr* expo) { + pow256 tmp; + pow256_from_Fr(tmp, expo); + POINTonE1_mult_gls((POINTonE1*)res, &BLS12_381_G1, tmp); + vec_zero(&tmp, sizeof(tmp)); } // ------------------- E2 utilities @@ -996,7 +990,8 @@ static void E2_neg(E2* a) { void E2_mult(E2* res, const E2* p, const Fr* expo) { pow256 tmp; pow256_from_Fr(tmp, expo); - POINTonE2_sign((POINTonE2*)res, (POINTonE2*)p, tmp); + POINTonE2_mult_gls((POINTonE2*)res, (POINTonE2*)p, tmp); + vec_zero(&tmp, sizeof(tmp)); } // Exponentiation of a generic point `a` in E2 by a byte exponent. @@ -1005,14 +1000,16 @@ void E2_mult_small_expo(E2* res, const E2* p, const byte expo) { vec_zero(&pow_expo, sizeof(pow256)); pow_expo[0] = expo; // `pow256` uses bytes little endian. 
// TODO: to bench against a specific version of mult with 8 bits expo - POINTonE2_sign((POINTonE2*)res, (POINTonE2*)p, pow_expo); + POINTonE2_mult_gls((POINTonE2*)res, (POINTonE2*)p, pow_expo); + pow_expo[0] = 0; } // Exponentiation of generator g2 of G2, res = expo.g2 void G2_mult_gen(E2* res, const Fr* expo) { pow256 tmp; pow256_from_Fr(tmp, expo); - POINTonE2_sign((POINTonE2*)res, &BLS12_381_G2, tmp); + POINTonE2_mult_gls((POINTonE2*)res, &BLS12_381_G2, tmp); + vec_zero(&tmp, sizeof(tmp)); } // checks if input E2 point is on the subgroup G2. diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 5244e8cd16a..625e954397f 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -123,15 +123,15 @@ void Fp_mul_montg(Fp *, const Fp *, const Fp *); void Fp_squ_montg(Fp *, const Fp *); // E1 and G1 utilities +int E1_in_G1(const ep_t); +int G1_simple_subgroup_check(const ep_t); +void E1_mult(E1*, const E1*, const Fr*); +void G1_mult_gen(E1*, const Fr*); + int ep_read_bin_compact(ep_t, const byte *, const int); void ep_write_bin_compact(byte *, const ep_t, const int); -void ep_mult_gen_bench(ep_t, const Fr*); -void ep_mult_generic_bench(ep_t, const Fr*); -void ep_mult(ep_t, const ep_t, const Fr*); void ep_sum_vector(ep_t, ep_st*, const int); int ep_sum_vector_byte(byte*, const byte*, const int); -int E1_in_G1(const ep_t); -int G1_simple_subgroup_check(const ep_t); void map_bytes_to_G1(E1*, const uint8_t*, int); void map_bytes_to_G1complement(E1*, const uint8_t*, int); #if (MEMBERSHIP_CHECK_G1 == BOWE) diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 563ca26811b..9a9026e4056 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -13,7 +13,7 @@ import ( ) // G1 and G2 scalar multiplication -func BenchmarkScalarMultG1G2(b *testing.B) { +func BenchmarkScalarMult(b *testing.B) { seed := make([]byte, securityBits/8) _, err := mrand.Read(seed) require.NoError(b, err) @@ -22,7 +22,9 @@ func BenchmarkScalarMultG1G2(b *testing.B) { _ = mapToFr(&expo, seed) // G1 generator multiplication - b.Run("G1 gen", func(b *testing.B) { + // Note that generator and random point multiplications + // are implemented with the same algorithm + b.Run("G1", func(b *testing.B) { var res pointE1 b.ResetTimer() for i := 0; i < b.N; i++ { @@ -31,18 +33,10 @@ func BenchmarkScalarMultG1G2(b *testing.B) { b.StopTimer() }) - // G1 base point multiplication - b.Run("G1 generic", func(b *testing.B) { - var res pointE1 - b.ResetTimer() - for i := 0; i < b.N; i++ { - genericScalarMultG1(&res, &expo) - } - b.StopTimer() - }) - - // G2 base point multiplication - b.Run("G2 gen", func(b *testing.B) { + // G2 generator multiplication + // Note that generator and random point multiplications + // are implemented with the same algorithm + b.Run("G2", func(b *testing.B) { var res pointE2 b.ResetTimer() for i := 0; i < b.N; i++ { From 109b238acb024071927fb68f6e364f7f8ac90199 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 11 May 2023 20:06:37 -0600 Subject: [PATCH 077/200] E1_read_bytes and E1_write_bytes and their tools --- crypto/bls12381_utils.c | 231 +++++++++++++++++++++++++++++++++------- crypto/bls12381_utils.h | 15 ++- crypto/bls_core.c | 5 +- 3 files changed, 204 insertions(+), 47 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index f188cb0d33e..c0087591e8f 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -375,17 +375,17 @@ bool_t map_bytes_to_Fr(Fr* a, const uint8_t* bin, int len) { 
const Fp BLS12_381_pR = { ONE_MONT_P }; /* R mod p = (1<<384)%p */ // sets `a` to 0 -void Fp_set_zero(Fp* a){ +static void Fp_set_zero(Fp* a){ vec_zero((byte*)a, sizeof(Fp)); } // sets `a` to limb `l` -void Fp_set_limb(Fp* a, const limb_t l){ +static void Fp_set_limb(Fp* a, const limb_t l){ vec_zero((byte*)a + sizeof(limb_t), sizeof(Fp) - sizeof(limb_t)); *((limb_t*)a) = l; } -void Fp_copy(Fp* res, const Fp* a) { +static void Fp_copy(Fp* res, const Fp* a) { vec_copy((byte*)res, (byte*)a, sizeof(Fp)); } @@ -393,14 +393,24 @@ static void Fp_add(Fp *res, const Fp *a, const Fp *b) { add_mod_384((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_P); } -void Fp_sub(Fp *res, const Fp *a, const Fp *b) { +static void Fp_sub(Fp *res, const Fp *a, const Fp *b) { sub_mod_384((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_P); } -void Fp_neg(Fp *res, const Fp *a) { +static void Fp_neg(Fp *res, const Fp *a) { cneg_mod_384((limb_t*)res, (limb_t*)a, 1, BLS12_381_P); } +// checks if `a` is a quadratic residue in Fp. If yes, it computes +// the square root in `res`. +// +// The boolean output is valid whether `a` is in Montgomery form or not, +// since montgomery constant `R` is a quadratic residue. +// However, the square root is valid only if `a` is in montgomery form. +static bool_t Fp_sqrt_montg(Fp *res, const Fp* a) { + return sqrt_fp((limb_t*)res, (limb_t*)a); +} + static bool check_Fp(const Fp* in) { // use same method as in BLST internal function // which seems the most efficient. The method uses the assembly-based @@ -497,37 +507,39 @@ static int fp_read_bin_safe(fp_t a, const uint8_t *bin, int len) { // 1 if y > (p - 1)/2 and 0 otherwise. // y is in montgomery form static byte Fp_get_sign(const Fp* y) { - return sgn0_pty_mont_384((const limb_t*)y, BLS12_381_P, p0); + // BLST's sgn0_pty_mont_384 requires input to be in Montg form. + // The needed sign bit is on position 1 ! + return (sgn0_pty_mont_384((const limb_t*)y, BLS12_381_P, p0)>>1) & 1; } // ------------------- Fp^2 utilities // sets `a` to limb `l` -void Fp2_set_limb(Fp2* a, const limb_t l){ +static void Fp2_set_limb(Fp2* a, const limb_t l){ Fp_set_limb(&real(a), l); Fp_set_zero(&imag(a)); } -void Fp2_add(Fp2 *res, const Fp2 *a, const Fp2 *b) { +static void Fp2_add(Fp2 *res, const Fp2 *a, const Fp2 *b) { add_mod_384x((vec384*)res, (vec384*)a, (vec384*)b, BLS12_381_P); } -void Fp2_sub(Fp2 *res, const Fp2 *a, const Fp2 *b) { +static void Fp2_sub(Fp2 *res, const Fp2 *a, const Fp2 *b) { sub_mod_384x((vec384*)res, (vec384*)a, (vec384*)b, BLS12_381_P); } -void Fp2_neg(Fp2 *res, const Fp2 *a) { +static void Fp2_neg(Fp2 *res, const Fp2 *a) { cneg_mod_384(real(res), real(a), 1, BLS12_381_P); cneg_mod_384(imag(res), imag(a), 1, BLS12_381_P); } // res = a*b in montgomery form -void Fp2_mul_montg(Fp2 *res, const Fp2 *a, const Fp2 *b) { +static void Fp2_mul_montg(Fp2 *res, const Fp2 *a, const Fp2 *b) { mul_mont_384x((vec384*)res, (vec384*)a, (vec384*)b, BLS12_381_P, p0); } // res = a^2 in montgomery form -void Fp2_squ_montg(Fp2 *res, const Fp2 *a) { +static void Fp2_squ_montg(Fp2 *res, const Fp2 *a) { sqr_mont_384x((vec384*)res, (vec384*)a, BLS12_381_P, p0); } @@ -537,7 +549,7 @@ void Fp2_squ_montg(Fp2 *res, const Fp2 *a) { // The boolean output is valid whether `a` is in Montgomery form or not, // since montgomery constant `R` is a quadratic residue. // However, the square root is valid only if `a` is in montgomery form. 
-static bool_t Fp2_sqrt(Fp2 *res, const Fp2* a) { +static bool_t Fp2_sqrt_montg(Fp2 *res, const Fp2* a) { return sqrt_fp2((vec384*)res, (vec384*)a); } @@ -545,6 +557,8 @@ static bool_t Fp2_sqrt(Fp2 *res, const Fp2* a) { // sign(y_0) if y_1 = 0, else sign(y_1) // y coordinates must be in montgomery form static byte Fp2_get_sign(Fp2* y) { + // BLST's sgn0_pty_mont_384x requires input to be in Montg form. + // The needed sign bit is on position 1 ! return (sgn0_pty_mont_384x((vec384*)y, BLS12_381_P, p0)>>1) & 1; } @@ -578,16 +592,7 @@ void Fp2_write_bytes(byte *bin, const Fp2* a) { // ------------------- G1 utilities -// ep_read_bin_compact imports a point from a buffer in a compressed or uncompressed form. -// len is the size of the input buffer. -// -// The resulting point is guaranteed to be on the curve E1. -// The serialization follows: -// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -// The code is a modified version of Relic ep_read_bin -// -// It returns RLC_OK if the inputs are valid (input buffer lengths are valid and coordinates correspond -// to a point on curve) and the execution completes, and RLC_ERR otherwise. +// TODO: to delete, only used by temporary E2_blst_to_relic int ep_read_bin_compact(ep_t a, const byte *bin, const int len) { // check the length const int G1_size = (G1_BYTES/(G1_SERIALIZATION+1)); @@ -661,11 +666,7 @@ static int fp_get_sign(const fp_t y) { return bn_cmp(bn_y, &bls_prec->p_1div2) == RLC_GT; } -// ep_write_bin_compact exports a point a in E(Fp) to a buffer bin in a compressed or uncompressed form. -// len is the allocated size of the buffer bin. -// The serialization is following: -// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -// The code is a modified version of Relic ep_write_bin +// TODO: to delete, only used by temporary E2_blst_to_relic void ep_write_bin_compact(byte *bin, const ep_t a, const int len) { const int G1_size = (G1_BYTES/(G1_SERIALIZATION+1)); @@ -701,6 +702,158 @@ void ep_write_bin_compact(byte *bin, const ep_t a, const int len) { bin[0] |= (G1_SERIALIZATION << 7); } +void E1_copy(E1* res, const E1* p) { + vec_copy(res, p, sizeof(E1)); +} + +// compare p to infinity +bool_t E1_is_infty(const E1* p) { + // BLST infinity points are defined by Z=0 + return vec_is_zero(p->z, sizeof(p->z)); +} + +// set p to infinity +void E1_set_infty(E1* p) { + // BLST infinity points are defined by Z=0 + vec_zero(p->z, sizeof(p->z)); +} + +// converts an E1 point from Jacobian into affine coordinates (z=1) +void E1_to_affine(E1* res, const E1* p) { + // optimize in case coordinates are already affine + if (vec_is_equal(p->z, BLS12_381_pR, Fp_BYTES)) { + E1_copy(res, p); + return; + } + // convert from Jacobian + POINTonE1_from_Jacobian((POINTonE1*)res, (const POINTonE1*)p); +} + +// checks affine point `p` is in E1 +bool_t E1_affine_on_curve(const E1* p) { + // BLST's `POINTonE1_affine_on_curve` does not include the inifity case! + return POINTonE1_affine_on_curve((POINTonE1_affine*)p) | E1_is_infty(p); +} + +// E1_read_bytes imports a E1(Fp) point from a buffer in a compressed or uncompressed form. +// The resulting point is guaranteed to be on curve E1 (no G1 check is included). 
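+// The serialization header is carried in the first byte: bit 7 is the compression flag,
+// bit 6 is the infinity flag, and bit 5 holds the sign of the y coordinate (used in the
+// compressed form only).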
+// Expected serialization follows:
+// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-)
+//
+// returns:
+//    - BLST_BAD_ENCODING if the length is invalid or serialization header bits are invalid
+//    - BLST_BAD_SCALAR if Fp coordinates couldn't deserialize
+//    - BLST_POINT_NOT_ON_CURVE if deserialized point isn't on E1
+//    - BLST_SUCCESS if deserialization is valid
+
+// TODO: replace with POINTonE1_Deserialize_BE and POINTonE1_Uncompress_Z,
+// and update logic with G2 subgroup check?
+BLST_ERROR E1_read_bytes(E1* a, const byte *bin, const int len) {
+  // check the length
+  if (len != G1_SER_BYTES) {
+    return BLST_BAD_ENCODING;
+  }
+
+  // check the compression bit
+  int compressed = bin[0] >> 7;
+  if ((compressed == 1) != (G1_SERIALIZATION == COMPRESSED)) {
+    return BLST_BAD_ENCODING;
+  }
+
+  // check if the point is infinity
+  int is_infinity = bin[0] & 0x40;
+  if (is_infinity) {
+    // the remaining bits need to be cleared
+    if (bin[0] & 0x3F) {
+      return BLST_BAD_ENCODING;
+    }
+    for (int i=1; i<G1_SER_BYTES-1; i++) {
+      if (bin[i]) {
+        return BLST_BAD_ENCODING;
+      }
+    }
+    E1_set_infty(a);
+    return BLST_SUCCESS;
+  }
+
+  // read the sign bit and check for consistency
+  int y_sign = (bin[0] >> 5) & 1;
+  if (y_sign && (!compressed)) {
+    return BLST_BAD_ENCODING;
+  }
+
+  // use a temporary buffer to mask the header bits and read a.x
+  byte temp[Fp_BYTES];
+  memcpy(temp, bin, Fp_BYTES);
+  temp[0] &= 0x1F;        // clear the header bits
+  BLST_ERROR ret = Fp_read_bytes(&a->x, temp, sizeof(temp));
+  if (ret != BLST_SUCCESS) {
+    return ret;
+  }
+
+  // set a.z to 1
+  Fp* a_z = &(a->z);
+  Fp_set_limb(a_z, 1);
+
+  if (G1_SERIALIZATION == UNCOMPRESSED) {
+    ret = Fp_read_bytes(&(a->y), bin + Fp_BYTES, sizeof(a->y));
+    if (ret != BLST_SUCCESS){
+      return ret;
+    }
+    // check read point is on curve
+    if (!E1_affine_on_curve(a)) {
+      return BLST_POINT_NOT_ON_CURVE;
+    }
+    return BLST_SUCCESS;
+  }
+
+  // compute the possible square root
+  Fp* a_x = &(a->x);
+  Fp_to_montg(a_x, a_x);
+
+  Fp* a_y = &(a->y);
+  Fp_squ_montg(a_y, a_x);
+  Fp_mul_montg(a_y, a_y, a_x);
+  Fp_add(a_y, a_y, &B_E1);      // B_E1 is already in Montg form
+  if (!Fp_sqrt_montg(a_y, a_y)) // check whether x^3+b is a quadratic residue
+    return BLST_POINT_NOT_ON_CURVE;
+
+  // resulting (x,y) is guaranteed to be on curve (y is already in Montg form)
+  if (Fp_get_sign(a_y) != y_sign) {
+    Fp_neg(a_y, a_y); // flip y sign if needed
+  }
+  return BLST_SUCCESS;
+}
+
+// E1_write_bytes exports a point in E1(Fp) to a buffer in a compressed or uncompressed form.
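+// (G1_SER_BYTES is 48 bytes when the compressed form is used and 96 bytes otherwise.)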
+// It assumes buffer is of length G1_SER_BYTES +// The serialization follows: +// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) +void E1_write_bytes(byte *bin, const E1* a) { + if (E1_is_infty(a)) { + // set the infinity bit + bin[0] = (G1_SERIALIZATION << 7) | (1 << 6); + memset(bin+1, 0, G1_SER_BYTES-1); + return; + } + E1 tmp; + E1_to_affine(&tmp, a); // TODO: implement + + Fp* t_x = &(tmp.x); + Fp_from_montg(t_x, t_x); + Fp_write_bytes(bin, t_x); + + Fp* t_y = &(tmp.y); + if (G1_SERIALIZATION == COMPRESSED) { + bin[0] |= (Fp_get_sign(t_y) << 5); + } else { + Fp_from_montg(t_y, t_y); + Fp_write_bytes(bin + Fp_BYTES, t_y); + } + + bin[0] |= (G1_SERIALIZATION << 7); +} + // Exponentiation of a generic point `a` in E1, res = expo.a void E1_mult(E1* res, const E1* p, const Fr* expo) { pow256 tmp; @@ -713,7 +866,7 @@ void E1_mult(E1* res, const E1* p, const Fr* expo) { void G1_mult_gen(E1* res, const Fr* expo) { pow256 tmp; pow256_from_Fr(tmp, expo); - POINTonE1_mult_gls((POINTonE1*)res, &BLS12_381_G1, tmp); + POINTonE1_mult_glv((POINTonE1*)res, &BLS12_381_G1, tmp); vec_zero(&tmp, sizeof(tmp)); } @@ -810,11 +963,9 @@ ep2_st* E2_blst_to_relic(const E2* x) { return out; } -// E2_read_bytes imports a point from a buffer in a compressed or uncompressed form. -// The resulting point is guaranteed to be on curve E2 (no G2 check is included) +// E2_read_bytes imports a E2(Fp^2) point from a buffer in a compressed or uncompressed form. +// The resulting point is guaranteed to be on curve E2 (no G2 check is included). // -// reads a scalar in `a` and checks it is a valid Fp element (a < p). -// input is bytes-big-endian. // returns: // - BLST_BAD_ENCODING if the length is invalid or serialization header bits are invalid // - BLST_BAD_SCALAR if Fp^2 coordinates couldn't deserialize @@ -848,7 +999,7 @@ BLST_ERROR E2_read_bytes(E2* a, const byte *bin, const int len) { } } E2_set_infty(a); - return RLC_OK; + return BLST_SUCCESS; } // read the sign bit and check for consistency @@ -892,7 +1043,7 @@ BLST_ERROR E2_read_bytes(E2* a, const byte *bin, const int len) { Fp2_squ_montg(a_y, a_x); Fp2_mul_montg(a_y, a_y, a_x); Fp2_add(a_y, a_y, &B_E2); // B_E2 is already in Montg form - if (!Fp2_sqrt(a_y, a_y)) // check whether x^3+b is a quadratic residue + if (!Fp2_sqrt_montg(a_y, a_y)) // check whether x^3+b is a quadratic residue return BLST_POINT_NOT_ON_CURVE; // resulting (x,y) is guaranteed to be on curve (y is already in Montg form) @@ -902,11 +1053,10 @@ BLST_ERROR E2_read_bytes(E2* a, const byte *bin, const int len) { return BLST_SUCCESS; } -// E2_write_bytes exports a point in E(Fp^2) to a buffer in a compressed or uncompressed form. +// E2_write_bytes exports a point in E2(Fp^2) to a buffer in a compressed or uncompressed form. // It assumes buffer is of length G2_SER_BYTES // The serialization follows: -// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -// The code is a modified version of Relic ep2_write_bin +// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) void E2_write_bytes(byte *bin, const E2* a) { if (E2_is_infty(a)) { // set the infinity bit @@ -948,8 +1098,7 @@ bool_t E2_is_infty(const E2* p) { // checks affine point `p` is in E2 bool_t E2_affine_on_curve(const E2* p) { - // BLST's `POINTonE2_affine_on_curve` does not include the inifity case, - // unlike what the function name means. 
+ // BLST's `POINTonE2_affine_on_curve` does not include the infinity case! return POINTonE2_affine_on_curve((POINTonE2_affine*)p) | E2_is_infty(p); } @@ -966,7 +1115,7 @@ void E2_copy(E2* res, const E2* p) { // converts an E2 point from Jacobian into affine coordinates (z=1) void E2_to_affine(E2* res, const E2* p) { - // minor optimization in case coordinates are already affine + // optimize in case coordinates are already affine if (vec_is_equal(p->z, BLS12_381_Rx.p2, Fp2_BYTES)) { E2_copy(res, p); return; diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 625e954397f..24f43c96a26 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -123,10 +123,17 @@ void Fp_mul_montg(Fp *, const Fp *, const Fp *); void Fp_squ_montg(Fp *, const Fp *); // E1 and G1 utilities -int E1_in_G1(const ep_t); -int G1_simple_subgroup_check(const ep_t); -void E1_mult(E1*, const E1*, const Fr*); -void G1_mult_gen(E1*, const Fr*); +void E1_copy(E1*, const E1*); +void E1_set_infty(E1*); +bool_t E1_is_infty(const E1*); +void E1_to_affine(E1*, const E1*); +bool_t E1_affine_on_curve(const E1*); +bool_t E1_in_G1(const ep_t); +int G1_simple_subgroup_check(const ep_t); +void E1_mult(E1*, const E1*, const Fr*); +void G1_mult_gen(E1*, const Fr*); +BLST_ERROR E1_read_bytes(E1*, const byte *, const int); +void E1_write_bytes(byte *, const E1*); int ep_read_bin_compact(ep_t, const byte *, const int); void ep_write_bin_compact(byte *, const ep_t, const int); diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 58a7287578f..5ea02115d66 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -21,7 +21,7 @@ int get_sk_len() { // Checks if input point p is in the subgroup G1. // The function assumes the input is known to be on the curve E1. -int E1_in_G1(const ep_t p){ +bool_t E1_in_G1(const ep_t p){ // TODO: to upadte /* #if MEMBERSHIP_CHECK_G1 == EXP_ORDER @@ -42,7 +42,8 @@ static void bls_sign_ep(byte* s, const Fr* sk, const ep_t h) { ep_new(p); // s = h^sk - ep_mult(p, h, sk); + //ep_mult(p, h, sk); + ep_copy(p, h); ep_write_bin_compact(s, p, SIGNATURE_LEN); ep_free(p); } From 5ae1abbf8a6a4a5697bbc17bae7f23ebf25fa103 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 11 May 2023 20:23:05 -0600 Subject: [PATCH 078/200] G1 membership check and connect E1 read/write to the Go layer --- crypto/bls.go | 2 +- crypto/bls12381_utils.c | 7 ++++++ crypto/bls12381_utils.go | 46 ++++++++++++++++++------------------ crypto/bls12381_utils.h | 2 +- crypto/bls_crossBLST_test.go | 2 +- 5 files changed, 33 insertions(+), 26 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 804c34b619c..d814b1209c2 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -500,7 +500,7 @@ func (a *pubKeyBLSBLS12381) EncodeCompressed() []byte { panic("library is not configured to use compressed public key serialization") } dest := make([]byte, pubKeyLengthBLSBLS12381) - writePointG2(dest, &a.point) + writePointE2(dest, &a.point) return dest } diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index c0087591e8f..c9fca01df9a 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -735,6 +735,13 @@ bool_t E1_affine_on_curve(const E1* p) { return POINTonE1_affine_on_curve((POINTonE1_affine*)p) | E1_is_infty(p); } +// checks if input E1 point is on the subgroup G1. +// It assumes input `p` is on E1. 
+bool_t E1_in_G1(const E1* p){ + // currently uses Scott method + return POINTonE1_in_G1((const POINTonE1*)p); +} + // E1_read_bytes imports a E1(Fp) point from a buffer in a compressed or uncompressed form. // The resulting point is guaranteed to be on curve E1 (no G1 check is included). // Expected serialization follows: diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index df1df3e4f2b..bf6e4f996d0 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -77,7 +77,7 @@ func (a *scalar) String() string { func (p *pointE2) String() string { encoding := make([]byte, pubKeyLengthBLSBLS12381) - writePointG2(encoding, p) + writePointE2(encoding, p) return fmt.Sprintf("%#x", encoding) } @@ -166,21 +166,18 @@ func writeScalar(dest []byte, x *scalar) { C.Fr_write_bytes((*C.uchar)(&dest[0]), (*C.Fr)(x)) } -// writePointG2 writes a G2 point in a slice of bytes +// writePointE2 writes a G2 point in a slice of bytes // The slice should be of size PubKeyLenBLSBLS12381 and the serialization // follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves -func writePointG2(dest []byte, a *pointE2) { +func writePointE2(dest []byte, a *pointE2) { C.E2_write_bytes((*C.uchar)(&dest[0]), (*C.E2)(a)) } -// writePointG1 writes a G1 point in a slice of bytes -// The slice should be of size SignatureLenBLSBLS12381 and the serialization will -// follow the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves -func writePointG1(dest []byte, a *pointE1) { - C.ep_write_bin_compact((*C.uchar)(&dest[0]), - (*C.E1)(a), - (C.int)(signatureLengthBLSBLS12381), - ) +// writePointE1 writes a G1 point in a slice of bytes +// The slice should be of size SignatureLenBLSBLS12381 and the serialization +// follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves +func writePointE1(dest []byte, a *pointE1) { + C.E1_write_bytes((*C.uchar)(&dest[0]), (*C.E1)(a)) } // read an Fr* element from a byte slice @@ -218,11 +215,11 @@ func readPointE2(a *pointE2, src []byte) error { case blst_valid: return nil case blst_bad_encoding, blst_bad_scalar: - return invalidInputsErrorf("input could not deserialize to a G2 point") + return invalidInputsErrorf("input could not deserialize to a E2 point") case blst_point_not_on_curve: return invalidInputsErrorf("input is not a point on curve E2") default: - return errors.New("reading a G2 point failed") + return errors.New("reading E2 point failed") } } @@ -231,23 +228,26 @@ func readPointE2(a *pointE2, src []byte) error { // follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves. // No G1 membership check is performed. func readPointE1(a *pointE1, src []byte) error { - switch C.ep_read_bin_compact((*C.E1)(a), + read := C.E1_read_bytes((*C.E1)(a), (*C.uchar)(&src[0]), - (C.int)(len(src))) { - case valid: + (C.int)(len(src))) + + switch int(read) { + case blst_valid: return nil - case invalid: - return invalidInputsErrorf("input is not a G1 point") + case blst_bad_encoding, blst_bad_scalar: + return invalidInputsErrorf("input could not deserialize to a E1 point") + case blst_point_not_on_curve: + return invalidInputsErrorf("input is not a point on curve E1") default: - return errors.New("reading a G1 point failed") + return errors.New("reading E1 point failed") } } // checkMembershipG1 wraps a call to a subgroup check in G1 since cgo can't be used // in go test files. 
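// It returns true if and only if the input point, assumed to be on the curve E1,
// is in the prime-order subgroup G1.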
func checkMembershipG1(pt *pointE1) bool { - //return C.E1_in_G1((*C.E1)(pt)) != (C.ulonglong)(0) - return true + return C.E1_in_G1((*C.E1)(pt)) != (C.ulonglong)(0) } // checkMembershipG2 wraps a call to a subgroup check in G2 since cgo can't be used @@ -302,10 +302,10 @@ func hashToG1Bytes(data, dst []byte) []byte { // map the hash to G1 var point pointE1 - C.map_to_G1((*C.E1)(&point), (*C.uchar)(&hash[0]), (C.int)(len(hash))) + C.map_to_G1((*C.ep_st)(&point), (*C.uchar)(&hash[0]), (C.int)(len(hash))) // serialize the point pointBytes := make([]byte, signatureLengthBLSBLS12381) - writePointG1(pointBytes, &point) + writePointE1(pointBytes, &point) return pointBytes } diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 24f43c96a26..50bec52f133 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -128,7 +128,7 @@ void E1_set_infty(E1*); bool_t E1_is_infty(const E1*); void E1_to_affine(E1*, const E1*); bool_t E1_affine_on_curve(const E1*); -bool_t E1_in_G1(const ep_t); +bool_t E1_in_G1(const E1*); int G1_simple_subgroup_check(const ep_t); void E1_mult(E1*, const E1*, const Fr*); void G1_mult_gen(E1*, const Fr*); diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index e67c3c3bc33..e9b1607a721 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -158,7 +158,7 @@ func testEncodeDecodeG1CrossBLST(t *rapid.T) { // check both serializations of G1 points are equal if flowPass && blstPass { sigFlowOutBytes := make([]byte, signatureLengthBLSBLS12381) - writePointG1(sigFlowOutBytes, &pointFlow) + writePointE1(sigFlowOutBytes, &pointFlow) sigBLSTOutBytes := pointBLST.Compress() assert.Equal(t, sigFlowOutBytes, sigBLSTOutBytes) From 8d32cffd9acdf45b4b3461e3cd3abeddab591fd8 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 12 May 2023 00:29:17 -0600 Subject: [PATCH 079/200] map to G1 using BLST and add length sanity check of input hash --- crypto/bls.go | 15 ++++++++--- crypto/bls12381_hashtocurve.c | 23 ++++++++++++---- crypto/bls12381_utils.c | 8 ++++-- crypto/bls12381_utils.go | 5 +++- crypto/bls12381_utils.h | 6 +++-- crypto/bls12381_utils_test.go | 6 +++-- crypto/bls_core.c | 51 +++++++++++++++-------------------- crypto/bls_include.h | 10 +++---- 8 files changed, 72 insertions(+), 52 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index d814b1209c2..77164298a1d 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -66,8 +66,6 @@ const ( PubKeyLenBLSBLS12381 = 2 * fieldSize * (2 - serializationG2) // the length is divided by 2 if compression is on // Hash to curve params - // expandMsgOutput is the output length of the expand_message step as required by the hash_to_curve algorithm - expandMsgOutput = 2 * (fieldSize + (securityBits / 8)) // hash to curve suite ID of the form : CurveID_ || HashID_ || MapID_ || encodingVariant_ h2cSuiteID = "BLS12381G1_XOF:KMAC128_SSWU_RO_" // scheme implemented as a countermasure for rogue attacks of the form : SchemeTag_ @@ -79,6 +77,12 @@ const ( blsPOPCipherSuite = "BLS_POP_" + h2cSuiteID + schemeTag ) +// expandMsgOutput is the output length of the expand_message step as required by the +// hash_to_curve algorithm (and the map to G1 step) +// +// (Cgo does not export C macros) +var expandMsgOutput = C.get_mapToG1_input_len() + // blsBLS12381Algo, embeds SignAlgo type blsBLS12381Algo struct { // points to Relic context of BLS12-381 with all the parameters @@ -546,11 +550,14 @@ func (a *blsBLS12381Algo) init() error { } // This is only a TEST/DEBUG/BENCH function. 
-// It returns the hash to G1 point from a slice of 128 bytes +// It returns the hash-to-G1 point from a slice of 128 bytes func mapToG1(data []byte) *pointE1 { l := len(data) var h pointE1 - C.map_to_G1((*C.E1)(&h), (*C.uchar)(&data[0]), (C.int)(l)) + ret := C.map_to_G1((*C.E1)(&h), (*C.uchar)(&data[0]), (C.int)(l)) + if int(ret) != valid { + return nil + } return &h } diff --git a/crypto/bls12381_hashtocurve.c b/crypto/bls12381_hashtocurve.c index 3e8217d42e5..bff5e92f468 100644 --- a/crypto/bls12381_hashtocurve.c +++ b/crypto/bls12381_hashtocurve.c @@ -327,12 +327,25 @@ static void map_to_G1_local(ep_t p, const uint8_t *msg, int len) { } #endif -// computes a hash of input data to G1 -// construction 2 from section 5 in https://eprint.iacr.org/2019/403.pdf -void map_to_G1(ep_t h, const byte* data, const int len) { +// maps input `hash` bytes to G1. +// `hash` must be `MAP_TO_G1_INPUT_LEN` (128 bytes) +// It uses construction 2 from section 5 in https://eprint.iacr.org/2019/403.pdf +int map_to_G1(E1* h, const byte* hash, const int len) { + // sanity check of length + if (len != MAP_TO_G1_INPUT_LEN) { + return INVALID; + } + #if hashToPoint==LOCAL_SSWU map_to_G1_local(h, data, len); - #elif hashToPoint==RELIC_SSWU - ep_map_from_field(h, data, len); + + #elif hashToPoint==BLST_SSWU + // map to field elements + Fr u[2]; + map_bytes_to_Fr(&u[0], hash, MAP_TO_G1_INPUT_LEN/2); + map_bytes_to_Fr(&u[1], hash + MAP_TO_G1_INPUT_LEN/2, MAP_TO_G1_INPUT_LEN/2); + // map field elements to G1 + map_to_g1(h, (POINTonE1 *)&u[0], (POINTonE1 *)&u[1]); #endif + return VALID; } diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index c9fca01df9a..97c26b57713 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -25,6 +25,10 @@ int get_Fr_BYTES() { return Fr_BYTES; } +int get_mapToG1_input_len() { + return MAP_TO_G1_INPUT_LEN; +} + // Initializes Relic context with BLS12-381 parameters ctx_t* relic_init_BLS12_381() { @@ -411,7 +415,7 @@ static bool_t Fp_sqrt_montg(Fp *res, const Fp* a) { return sqrt_fp((limb_t*)res, (limb_t*)a); } -static bool check_Fp(const Fp* in) { +static bool Fp_check(const Fp* in) { // use same method as in BLST internal function // which seems the most efficient. 
The method uses the assembly-based // modular addition instead of limbs comparison @@ -453,7 +457,7 @@ BLST_ERROR Fp_read_bytes(Fp* a, const byte *bin, int len) { } limbs_from_be_bytes((limb_t*)a, bin, Fp_BYTES); // compare read scalar to p - if (!check_Fp(a)) { + if (!Fp_check(a)) { return BLST_BAD_ENCODING; } return BLST_SUCCESS; diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index bf6e4f996d0..103577013cc 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -302,7 +302,10 @@ func hashToG1Bytes(data, dst []byte) []byte { // map the hash to G1 var point pointE1 - C.map_to_G1((*C.ep_st)(&point), (*C.uchar)(&hash[0]), (C.int)(len(hash))) + ret := C.map_to_G1((*C.E1)(&point), (*C.uchar)(&hash[0]), (C.int)(len(hash))) + if int(ret) != valid { + return nil + } // serialize the point pointBytes := make([]byte, signatureLengthBLSBLS12381) diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 50bec52f133..58023376c45 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -10,6 +10,7 @@ #include "relic.h" #include "blst_include.h" +#define SEC_BITS 128 #define VALID RLC_OK #define INVALID RLC_ERR #define UNDEFINED (((VALID&1)^1) | ((INVALID&2)^2)) // different value than RLC_OK and RLC_ERR @@ -21,7 +22,6 @@ #define MIN(a,b) ((a)>(b)?(b):(a)) // Fields and Group serialization lengths -#define SEC_BITS 128 #define Fp_BITS 381 #define Fp2_BYTES (2*Fp_BYTES) #define Fp_LIMBS BITS_TO_LIMBS(Fp_BITS) @@ -88,12 +88,14 @@ ep2_st* E2_blst_to_relic(const E2* x); int get_valid(); int get_invalid(); int get_Fr_BYTES(); +int get_mapToG1_input_len(); // BLS based SPoCK int bls_spock_verify(const E2*, const byte*, const E2*, const byte*); // hash to curve functions (functions in bls12381_hashtocurve.c) -void map_to_G1(ep_t, const byte*, const int); +#define MAP_TO_G1_INPUT_LEN (2*(Fp_BYTES + SEC_BITS/8)) +int map_to_G1(E1*, const byte*, const int); // Fr utilities extern const Fr BLS12_381_rR; diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 9a9026e4056..3fa827d2cc9 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -69,6 +69,7 @@ func TestMapToG1(t *testing.T) { for i, msg := range msgs { pointBytes := hashToG1Bytes(msg, dst) + require.NonNil(t, pointBytes) expectedPointBytes, err := hex.DecodeString(expectedPointString[i]) require.NoError(t, err) @@ -80,15 +81,16 @@ func TestMapToG1(t *testing.T) { // Hashing to G1 bench func BenchmarkMapToG1(b *testing.B) { - input := make([]byte, expandMsgOutput) for i := 0; i < len(input); i++ { input[i] = byte(i) } b.ResetTimer() + var p *pointE1 for i := 0; i < b.N; i++ { - mapToG1(input) + p = mapToG1(input) } + require.NonNil(b, p) b.StopTimer() } diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 5ea02115d66..7b3021b84a1 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -19,23 +19,6 @@ int get_sk_len() { return SK_LEN; } -// Checks if input point p is in the subgroup G1. -// The function assumes the input is known to be on the curve E1. 
-bool_t E1_in_G1(const ep_t p){ -// TODO: to upadte -/* - #if MEMBERSHIP_CHECK_G1 == EXP_ORDER - return G1_simple_subgroup_check(p); - #elif MEMBERSHIP_CHECK_G1 == BOWE - // section 3.2 from https://eprint.iacr.org/2019/814.pdf - return bowe_subgroup_check_G1(p); - #else - return UNDEFINED; - #endif -*/ - return VALID; -} - // Computes a BLS signature from a G1 point static void bls_sign_ep(byte* s, const Fr* sk, const ep_t h) { ep_t p; @@ -49,14 +32,19 @@ static void bls_sign_ep(byte* s, const Fr* sk, const ep_t h) { } // Computes a BLS signature from a hash -void bls_sign(byte* s, const Fr* sk, const byte* data, const int len) { +// `data` represents the hashed message with length `len` equal to +// `MAP_TO_G1_INPUT_LEN`. +int bls_sign(byte* s, const Fr* sk, const byte* data, const int len) { ep_t h; ep_new(h); // hash to G1 - map_to_G1(h, data, len); + if (map_to_G1(h, data, len) != VALID) { + return INVALID; + } // s = h^sk bls_sign_ep(s, sk, h); ep_free(h); + return VALID; } // Verifies a BLS signature (G1 point) against a public key (G2 point) @@ -67,23 +55,25 @@ static int bls_verify_ep(const E2* pk, const ep_t s, const byte* data, const int ep_t elemsG1[2]; ep2_t elemsG2[2]; - // elemsG1[0] = s ep_new(elemsG1[0]); - ep_copy(elemsG1[0], (ep_st*)s); - - // elemsG1[1] = h ep_new(elemsG1[1]); - // hash to G1 - map_to_G1(elemsG1[1], data, len); + ep2_new(elemsG2[1]); + ep2_new(&elemsG2[0]); - ep2_st* pk_tmp = E2_blst_to_relic(pk); + int ret = UNDEFINED; + + // elemsG1[0] = s + ep_copy(elemsG1[0], (ep_st*)s); // elemsG2[1] = pk - ep2_new(elemsG2[1]); + ep2_st* pk_tmp = E2_blst_to_relic(pk); ep2_copy(elemsG2[1], pk_tmp); - ep2_new(&elemsG2[0]); - int ret = UNDEFINED; + // elemsG1[1] = h + if (map_to_G1(elemsG1[1], data, len) != VALID) { + ret = INVALID; + goto out; + } #if DOUBLE_PAIRING // elemsG2[0] = -g2 @@ -321,7 +311,8 @@ int bls_verifyPerDistinctKey(const byte* sig, // Verifies a BLS signature in a byte buffer. // membership check of the signature in G1 is verified. // membership check of pk in G2 is not verified in this function. -// the membership check in G2 is separated to allow optimizing multiple verifications using the same key. +// the membership check in G2 is separated to optimize multiple verifications using the same key. +// `data` represents the hashed message with length `len` equal to `MAP_TO_G1_INPUT_LEN`. int bls_verify(const E2* pk, const byte* sig, const byte* data, const int len) { ep_t s; ep_new(s); diff --git a/crypto/bls_include.h b/crypto/bls_include.h index d0f9120beb2..0da961feae2 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -21,18 +21,16 @@ #define SINGLE_PAIRING (DOUBLE_PAIRING^1) // algorithm choice for hashing to G1 -// both methods are similar implementations of the same optimized SSWU -// but offer different timings. 
-#define RELIC_SSWU 1 // relic library implementation -#define LOCAL_SSWU 2 // local implementation -#define hashToPoint LOCAL_SSWU +#define BLST_SSWU 1 // BLST implementation +#define LOCAL_SSWU 2 // local implementation +#define hashToPoint BLST_SSWU // bls core (functions in bls_core.c) int get_signature_len(); int get_pk_len(); int get_sk_len(); -void bls_sign(byte*, const Fr*, const byte*, const int); +int bls_sign(byte*, const Fr*, const byte*, const int); int bls_verify(const E2*, const byte*, const byte*, const int); int bls_verifyPerDistinctMessage(const byte*, const int, const byte*, const uint32_t*, const uint32_t*, const E2*); From 9efaddb65595f0311ccda0f4ea82ee62f143a4f7 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 12 May 2023 00:40:13 -0600 Subject: [PATCH 080/200] remove local SSWU implementation and rely only on BLST map to G1 tools --- crypto/bls.go | 5 +- crypto/bls12381_hashtocurve.c | 351 ---------------------------------- crypto/bls12381_utils.c | 39 ++-- crypto/bls12381_utils.go | 3 +- crypto/bls12381_utils.h | 21 -- crypto/bls12381_utils_test.go | 4 +- crypto/bls_include.h | 5 - 7 files changed, 22 insertions(+), 406 deletions(-) delete mode 100644 crypto/bls12381_hashtocurve.c diff --git a/crypto/bls.go b/crypto/bls.go index 77164298a1d..43f42f1115d 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -81,7 +81,7 @@ const ( // hash_to_curve algorithm (and the map to G1 step) // // (Cgo does not export C macros) -var expandMsgOutput = C.get_mapToG1_input_len() +var expandMsgOutput = int(C.get_mapToG1_input_len()) // blsBLS12381Algo, embeds SignAlgo type blsBLS12381Algo struct { @@ -554,8 +554,7 @@ func (a *blsBLS12381Algo) init() error { func mapToG1(data []byte) *pointE1 { l := len(data) var h pointE1 - ret := C.map_to_G1((*C.E1)(&h), (*C.uchar)(&data[0]), (C.int)(l)) - if int(ret) != valid { + if C.map_to_G1((*C.E1)(&h), (*C.uchar)(&data[0]), (C.int)(l)) != valid { return nil } return &h diff --git a/crypto/bls12381_hashtocurve.c b/crypto/bls12381_hashtocurve.c deleted file mode 100644 index bff5e92f468..00000000000 --- a/crypto/bls12381_hashtocurve.c +++ /dev/null @@ -1,351 +0,0 @@ -// +build relic - -#include "bls12381_utils.h" -#include "bls_include.h" - -extern prec_st* bls_prec; - -#if (hashToPoint== LOCAL_SSWU) - -// These constants are taken from https://github.com/kwantam/bls12-381_hash -// and converted to the Mongtomery domain. -// Copyright 2019 Riad S. 
Wahby -const uint64_t iso_Nx_data[ELLP_Nx_LEN][Fp_LIMBS] = { - {0x4d18b6f3af00131c, 0x19fa219793fee28c, 0x3f2885f1467f19ae, - 0x23dcea34f2ffb304, 0xd15b58d2ffc00054, 0x0913be200a20bef4,}, - {0x898985385cdbbd8b, 0x3c79e43cc7d966aa, 0x1597e193f4cd233a, - 0x8637ef1e4d6623ad, 0x11b22deed20d827b, 0x07097bc5998784ad,}, - {0xa542583a480b664b, 0xfc7169c026e568c6, 0x5ba2ef314ed8b5a6, - 0x5b5491c05102f0e7, 0xdf6e99707d2a0079, 0x0784151ed7605524,}, - {0x494e212870f72741, 0xab9be52fbda43021, 0x26f5577994e34c3d, - 0x049dfee82aefbd60, 0x65dadd7828505289, 0x0e93d431ea011aeb,}, - {0x90ee774bd6a74d45, 0x7ada1c8a41bfb185, 0x0f1a8953b325f464, - 0x104c24211be4805c, 0x169139d319ea7a8f, 0x09f20ead8e532bf6,}, - {0x6ddd93e2f43626b7, 0xa5482c9aa1ccd7bd, 0x143245631883f4bd, - 0x2e0a94ccf77ec0db, 0xb0282d480e56489f, 0x18f4bfcbb4368929,}, - {0x23c5f0c953402dfd, 0x7a43ff6958ce4fe9, 0x2c390d3d2da5df63, - 0xd0df5c98e1f9d70f, 0xffd89869a572b297, 0x1277ffc72f25e8fe,}, - {0x79f4f0490f06a8a6, 0x85f894a88030fd81, 0x12da3054b18b6410, - 0xe2a57f6505880d65, 0xbba074f260e400f1, 0x08b76279f621d028,}, - {0xe67245ba78d5b00b, 0x8456ba9a1f186475, 0x7888bff6e6b33bb4, - 0xe21585b9a30f86cb, 0x05a69cdcef55feee, 0x09e699dd9adfa5ac,}, - {0x0de5c357bff57107, 0x0a0db4ae6b1a10b2, 0xe256bb67b3b3cd8d, - 0x8ad456574e9db24f, 0x0443915f50fd4179, 0x098c4bf7de8b6375,}, - {0xe6b0617e7dd929c7, 0xfe6e37d442537375, 0x1dafdeda137a489e, - 0xe4efd1ad3f767ceb, 0x4a51d8667f0fe1cf, 0x054fdf4bbf1d821c,}, - {0x72db2a50658d767b, 0x8abf91faa257b3d5, 0xe969d6833764ab47, - 0x464170142a1009eb, 0xb14f01aadb30be2f, 0x18ae6a856f40715d,}, -}; - -const uint64_t iso_Ny_data[ELLP_Ny_LEN][Fp_LIMBS] = { - {0x2b567ff3e2837267, 0x1d4d9e57b958a767, 0xce028fea04bd7373, - 0xcc31a30a0b6cd3df, 0x7d7b18a682692693, 0x0d300744d42a0310,}, - {0x99c2555fa542493f, 0xfe7f53cc4874f878, 0x5df0608b8f97608a, - 0x14e03832052b49c8, 0x706326a6957dd5a4, 0x0a8dadd9c2414555,}, - {0x13d942922a5cf63a, 0x357e33e36e261e7d, 0xcf05a27c8456088d, - 0x0000bd1de7ba50f0, 0x83d0c7532f8c1fde, 0x13f70bf38bbf2905,}, - {0x5c57fd95bfafbdbb, 0x28a359a65e541707, 0x3983ceb4f6360b6d, - 0xafe19ff6f97e6d53, 0xb3468f4550192bf7, 0x0bb6cde49d8ba257,}, - {0x590b62c7ff8a513f, 0x314b4ce372cacefd, 0x6bef32ce94b8a800, - 0x6ddf84a095713d5f, 0x64eace4cb0982191, 0x0386213c651b888d,}, - {0xa5310a31111bbcdd, 0xa14ac0f5da148982, 0xf9ad9cc95423d2e9, - 0xaa6ec095283ee4a7, 0xcf5b1f022e1c9107, 0x01fddf5aed881793,}, - {0x65a572b0d7a7d950, 0xe25c2d8183473a19, 0xc2fcebe7cb877dbd, - 0x05b2d36c769a89b0, 0xba12961be86e9efb, 0x07eb1b29c1dfde1f,}, - {0x93e09572f7c4cd24, 0x364e929076795091, 0x8569467e68af51b5, - 0xa47da89439f5340f, 0xf4fa918082e44d64, 0x0ad52ba3e6695a79,}, - {0x911429844e0d5f54, 0xd03f51a3516bb233, 0x3d587e5640536e66, - 0xfa86d2a3a9a73482, 0xa90ed5adf1ed5537, 0x149c9c326a5e7393,}, - {0x462bbeb03c12921a, 0xdc9af5fa0a274a17, 0x9a558ebde836ebed, - 0x649ef8f11a4fae46, 0x8100e1652b3cdc62, 0x1862bd62c291dacb,}, - {0x05c9b8ca89f12c26, 0x0194160fa9b9ac4f, 0x6a643d5a6879fa2c, - 0x14665bdd8846e19d, 0xbb1d0d53af3ff6bf, 0x12c7e1c3b28962e5,}, - {0xb55ebf900b8a3e17, 0xfedc77ec1a9201c4, 0x1f07db10ea1a4df4, - 0x0dfbd15dc41a594d, 0x389547f2334a5391, 0x02419f98165871a4,}, - {0xb416af000745fc20, 0x8e563e9d1ea6d0f5, 0x7c763e17763a0652, - 0x01458ef0159ebbef, 0x8346fe421f96bb13, 0x0d2d7b829ce324d2,}, - {0x93096bb538d64615, 0x6f2a2619951d823a, 0x8f66b3ea59514fa4, - 0xf563e63704f7092f, 0x724b136c4cf2d9fa, 0x046959cfcfd0bf49,}, - {0xea748d4b6e405346, 0x91e9079c2c02d58f, 0x41064965946d9b59, - 0xa06731f1d2bbe1ee, 0x07f897e267a33f1b, 0x1017290919210e5f,}, - 
{0x872aa6c17d985097, 0xeecc53161264562a, 0x07afe37afff55002, - 0x54759078e5be6838, 0xc4b92d15db8acca8, 0x106d87d1b51d13b9,}, -}; - -// sqrt_ration optimized for p mod 4 = 3. -// Check if (U/V) is a square, return 1 if yes, 0 otherwise -// If 1 is returned, out contains sqrt(U/V), -// otherwise out is sqrt(z*U/V) -// out should not be the same as U, or V -static int sqrt_ratio_3mod4(fp_t out, const fp_t u, const fp_t v) { - fp_t t0, t1, t2; - - fp_sqr(t1, v); // V^2 - fp_mul(t2, u, v); // U*V - fp_mul(t1, t1, t2); // U*V^3 - fp_exp(out, t1, &bls_prec->p_3div4); // (U*V^3)^((p-3)/4) - fp_mul(out, out, t2); // (U*V)*(U*V^3)^((p-3)/4) = U^((p+1)/4) * V^(3p-5)/4 - - fp_sqr(t0, out); // out^2 - fp_mul(t0, t0, v); // out^2 * V - - int res = 1; - if (fp_cmp(t0, u) != RLC_EQ) { // check whether U/V is a quadratic residue - fp_mul(out, out, bls_prec->sqrt_z); // sqrt(-z)*U*V(UV^3)^((p-3)/4) - res = 0; - } - - return res; -} - -// returns 1 if input is odd and 0 if input is even -static int sign_0(const fp_t in) { -#if FP_RDC == MONTY - bn_t tmp; - fp_prime_back(tmp, in); // TODO: entire reduction may not be needed to get the parity - return bn_is_even(tmp); -#endif - return in[0]&1; -} - -// Maps the field element t to a point p in E1(Fp) where E1: y^2 = g(x) = x^3 + a1*x + b1 -// using optimized non-constant-time Simplified SWU implementation (A.B = 0) -// Outout point p is in Jacobian coordinates to avoid extra inversions. -static inline void map_to_E1_osswu(ep_t p, const fp_t t) { - fp_t t0, t1, t2, t3, t4; - - // get the isogeny map coefficients - ctx_t* ctx = core_get(); - fp_t *a1 = &ctx->ep_iso.a; - fp_t *b1 = &ctx->ep_iso.b; - fp_t *z = &ctx->ep_map_u; - - // compute numerator and denominator of X0(t) = N / D - fp_sqr(t1, t); // t^2 - fp_mul(t1, t1, *z); // z * t^2 - fp_sqr(t2, t1); // z^2 * t^4 - fp_add(t2, t2, t1); // z * t^2 + z^2 * t^4 - fp_add(t3, t2, bls_prec->r); // z * t^2 + z^2 * t^4 + 1 - fp_mul(t3, t3, *b1); // N = b * (z * t^2 + z^2 * t^4 + 1) - - if (fp_is_zero(t2)) { - fp_copy(p->z, bls_prec->a1z); // D = a * z - } else { - fp_mul(p->z, t2, bls_prec->minus_a1); // D = - a * (z * t^2 + z^2 * t^4) - } - - // compute numerator and denominator of g(X0(t)) = U / V - // U = N^3 + a1 * N * D^2 + b1 * D^3 - // V = D^3 - fp_sqr(t2, t3); // N^2 - fp_sqr(t0, p->z); // D^2 - fp_mul(t4, *a1, t0); // a * D^2 - fp_add(t2, t4, t2); // N^2 + a * D^2 - fp_mul(t2, t3, t2); // N^3 + a * N * D^2 - fp_mul(t0, t0, p->z); // V = D^3 - fp_mul(t4, *b1, t0); // b * V = b * D^3 - fp_add(t2, t4, t2); // U = N^3 + a1 * N * D^2 + b1 * D^3 - - // compute sqrt(U/V) - int is_sqr = sqrt_ratio_3mod4(p->y, t2, t0); - if (is_sqr) { - fp_copy(p->x, t3); // x = N - } else { - fp_mul(p->x, t1, t3); // x = N * z * t^2 - fp_mul(t1, t1, t); // z * t^3 - fp_mul(p->y, p->y, t1); // y = z * t^3 * sqrt(r * U/V) where r is 1 or map coefficient z - } - - // negate y to be the same sign of t - if (sign_0(t) != sign_0(p->y)) { - fp_neg(p->y, p->y); // -y - } - - // convert (x/D, y) into Jacobian (X,Y,Z) where Z=D to avoid inversion. - // Z = D, X = x/D * D^2 = x*D , Y = y*D^3 - fp_mul(p->x, p->x, p->z); // X = N*D - fp_mul(p->y, p->y, t0); // Y = y*D^3 - // p->z is already equal to D - p->coord = JACOB; -} - -// This code is taken from https://github.com/kwantam/bls12-381_hash -// and adapted to use Relic modular arithemtic. -// Copyright 2019 Riad S. 
Wahby -static inline void hornerPolynomial(fp_t accumulator, const fp_t x, const int start_val, const fp_t fp_tmp[]) { - for (int i = start_val; i >= 0; --i) { - fp_mul(accumulator, accumulator, x); // acc *= x - fp_add(accumulator, accumulator, fp_tmp[i]); // acc += next_val - } -} - -// This code is taken from https://github.com/kwantam/bls12-381_hash -// and adapted to use Relic modular arithemtic. -// Copyright 2019 Riad S. Wahby -static inline void compute_map_zvals(fp_t out[], const fp_t inv[], const fp_t zv[], const unsigned len) { - for (unsigned i = 0; i < len; ++i) { - fp_mul(out[i], inv[i], zv[i]); - } -} - -// 11-isogeny map -// computes the mapping of p and stores the result in r -// -// This code is taken from https://github.com/kwantam/bls12-381_hash -// and adapted to use Relic modular arithemtic. The constant tables -// iso_D and iso_N were converted to the Montgomery domain. -// -// Copyright 2019 Riad S. Wahby -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -static inline void eval_iso11(ep_t r, const ep_t p) { - fp_t fp_tmp[32]; - - // precompute even powers of Z up to Z^30 in fp_tmp[31]..fp_tmp[17] - fp_sqr(fp_tmp[31], p->z); // Z^2 - fp_sqr(fp_tmp[30], fp_tmp[31]); // Z^4 - fp_mul(fp_tmp[29], fp_tmp[30], fp_tmp[31]); // Z^6 - fp_sqr(fp_tmp[28], fp_tmp[30]); // Z^8 - fp_mul(fp_tmp[27], fp_tmp[28], fp_tmp[31]); // Z^10 - fp_sqr(fp_tmp[26], fp_tmp[29]); // Z^12 - fp_mul(fp_tmp[25], fp_tmp[26], fp_tmp[31]); // Z^14 - fp_sqr(fp_tmp[24], fp_tmp[28]); // Z^16 - fp_mul(fp_tmp[23], fp_tmp[24], fp_tmp[31]); // Z^18 - fp_sqr(fp_tmp[22], fp_tmp[27]); // Z^20 - fp_mul(fp_tmp[21], fp_tmp[22], fp_tmp[31]); // Z^22 - fp_sqr(fp_tmp[20], fp_tmp[26]); // Z^24 - fp_mul(fp_tmp[19], fp_tmp[20], fp_tmp[31]); // Z^26 - fp_sqr(fp_tmp[18], fp_tmp[25]); // Z^28 - fp_mul(fp_tmp[17], fp_tmp[18], fp_tmp[31]); // Z^30 - - // get isogeny map coefficients - iso_t iso = ep_curve_get_iso(); - // hardcode the constant to avoid warnings of gcc -Wstringop-overread - const int deg_dy = 15; // also equal to iso->deg_yd; - const int deg_dx = 10; // also equal to iso->deg_xd; - // TODO: get N coefficient from Relic and update N computations - - // y = Ny/Dy - // compute Dy - compute_map_zvals(fp_tmp, iso->yd, fp_tmp + 17, deg_dy); // k_(15-i) Z^(2i) - fp_add(fp_tmp[16], p->x, fp_tmp[deg_dy - 1]); // X + k_14 Z^2 - hornerPolynomial(fp_tmp[16], p->x, deg_dy - 2, fp_tmp); // Horner for the rest - fp_mul(fp_tmp[15], fp_tmp[16], fp_tmp[31]); // Dy * Z^2 - fp_mul(fp_tmp[15], fp_tmp[15], p->z); // Dy * Z^3 - - // compute Ny - compute_map_zvals(fp_tmp, bls_prec->iso_Ny, fp_tmp + 17, ELLP_Ny_LEN - 1); // k_(15-i) Z^(2i) - fp_mul(fp_tmp[16], p->x, bls_prec->iso_Ny[ELLP_Ny_LEN - 1]); // k_15 * X - fp_add(fp_tmp[16], fp_tmp[16], fp_tmp[ELLP_Ny_LEN - 2]); // k_15 * X + k_14 Z^2 - hornerPolynomial(fp_tmp[16], p->x, ELLP_Ny_LEN - 3, fp_tmp); // Horner for the rest - fp_mul(fp_tmp[16], fp_tmp[16], p->y); // Ny * Y - - // x = Nx/Dx - // compute Dx - compute_map_zvals(fp_tmp, iso->xd, fp_tmp + 22, deg_dx); // k_(10-i) Z^(2i) - 
fp_add(fp_tmp[14], p->x, fp_tmp[deg_dx - 1]); // X + k_9 Z^2 - hornerPolynomial(fp_tmp[14], p->x, deg_dx - 2, fp_tmp); // Horner for the rest - fp_mul(fp_tmp[14], fp_tmp[14], fp_tmp[31]); // Dx * Z^2 - - // compute Nx - compute_map_zvals(fp_tmp, bls_prec->iso_Nx, fp_tmp + 21, ELLP_Nx_LEN - 1); // k_(11-i) Z^(2i) - fp_mul(fp_tmp[13], p->x, bls_prec->iso_Nx[ELLP_Nx_LEN - 1]); // k_11 * X - fp_add(fp_tmp[13], fp_tmp[13], fp_tmp[ELLP_Nx_LEN - 2]); // k_11 * X + k_10 * Z^2 - hornerPolynomial(fp_tmp[13], p->x, ELLP_Nx_LEN - 3, fp_tmp); // Dy: Horner for the rest - - // compute the resulting point (Xo,Yo,Zo) - fp_mul(r->z, fp_tmp[14], fp_tmp[15]); // Zo = Dx Dy - fp_mul(r->x, fp_tmp[13], fp_tmp[15]); // Nx Dy - fp_mul(r->x, r->x, r->z); // Xo = Nx Dy Z - fp_sqr(fp_tmp[12], r->z); // Zo^2 - fp_mul(r->y, fp_tmp[16], fp_tmp[14]); // Ny Dx - fp_mul(r->y, r->y, fp_tmp[12]); // Yo = Ny Dx Zo^2 - r->coord = JACOB; -} - -// map an input point in E to a point in G1 by clearing the cofactor of G1 -static void clear_cofactor(ep_t out, const ep_t in) { - bn_t z; - bn_new(z); - fp_prime_get_par(z); - // compute 1-z - bn_neg(z, z); - bn_add_dig(z, z, 1); - ep_mul_dig(out, in, z->dp[0]); // z fits in 64 bits - bn_free(z); -} - -// construction 2 section 5 in in https://eprint.iacr.org/2019/403.pdf -// evaluate the optimized SSWU map twice, add resulting points, apply isogeny map, clear cofactor -// the result is stored in p -// msg is the input message to hash, must be at least 2*(FP_BYTES+16) = 128 bytes -static void map_to_G1_local(ep_t p, const uint8_t *msg, int len) { - RLC_TRY { - if (len < 2*(Fp_BYTES+16)) { - RLC_THROW(ERR_NO_BUFFER); - } - - fp_t t1, t2; - bn_t tmp; - bn_new(tmp); - bn_read_bin(tmp, msg, len/2); - fp_prime_conv(t1, tmp); - bn_read_bin(tmp, msg + len/2, len - len/2); - fp_prime_conv(t2, tmp); - bn_free(tmp); - - ep_t p_temp; - ep_new(p_temp); - // first mapping - map_to_E1_osswu(p_temp, t1); // map to E1 - eval_iso11(p_temp, p_temp); // map to E - - // second mapping - map_to_E1_osswu(p, t2); // map to E1 - eval_iso11(p, p); // map to E - // sum - // TODO: implement point addition in E1 and apply the isogeny map only once. - // Gives 4% improvement for map-to-curve overall - ep_add_jacob(p, p, p_temp); - - // clear the cofactor - clear_cofactor(p, p); // map to G1 - ep_free(p_temp); - } - RLC_CATCH_ANY { - RLC_THROW(ERR_CAUGHT); - } -} -#endif - -// maps input `hash` bytes to G1. 
-// `hash` must be `MAP_TO_G1_INPUT_LEN` (128 bytes) -// It uses construction 2 from section 5 in https://eprint.iacr.org/2019/403.pdf -int map_to_G1(E1* h, const byte* hash, const int len) { - // sanity check of length - if (len != MAP_TO_G1_INPUT_LEN) { - return INVALID; - } - - #if hashToPoint==LOCAL_SSWU - map_to_G1_local(h, data, len); - - #elif hashToPoint==BLST_SSWU - // map to field elements - Fr u[2]; - map_bytes_to_Fr(&u[0], hash, MAP_TO_G1_INPUT_LEN/2); - map_bytes_to_Fr(&u[1], hash + MAP_TO_G1_INPUT_LEN/2, MAP_TO_G1_INPUT_LEN/2); - // map field elements to G1 - map_to_g1(h, (POINTonE1 *)&u[0], (POINTonE1 *)&u[1]); - #endif - return VALID; -} diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 97c26b57713..13b9f948bed 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -61,12 +61,6 @@ ctx_t* relic_init_BLS12_381() { prec_st bls_prec_st; prec_st* bls_prec = NULL; -// required constants for the optimized SWU hash to curve -#if (hashToPoint == LOCAL_SSWU) -extern const uint64_t iso_Nx_data[ELLP_Nx_LEN][Fp_LIMBS]; -extern const uint64_t iso_Ny_data[ELLP_Ny_LEN][Fp_LIMBS]; -#endif - #if (MEMBERSHIP_CHECK_G1 == BOWE) extern const uint64_t beta_data[Fp_LIMBS]; extern const uint64_t z2_1_by3_data[2]; @@ -83,27 +77,11 @@ void precomputed_data_set(const prec_st* p) { // pre-compute some data required for curve BLS12-381 prec_st* init_precomputed_data_BLS12_381() { - bls_prec = &bls_prec_st; ctx_t* ctx = core_get(); // (p-1)/2 bn_div_dig(&bls_prec->p_1div2, &ctx->prime, 2); - #if (hashToPoint == LOCAL_SSWU) - // (p-3)/4 - bn_div_dig(&bls_prec->p_3div4, &bls_prec->p_1div2, 2); - // sqrt(-z) - fp_neg(bls_prec->sqrt_z, ctx->ep_map_u); - fp_srt(bls_prec->sqrt_z, bls_prec->sqrt_z); - // -a1 and a1*z - fp_neg(bls_prec->minus_a1, ctx->ep_iso.a); - fp_mul(bls_prec->a1z, ctx->ep_iso.a, ctx->ep_map_u); - - for (int i=0; iiso_Nx[i], iso_Nx_data[i]); - for (int i=0; iiso_Ny[i], iso_Ny_data[i]); - #endif #if (MEMBERSHIP_CHECK_G1 == BOWE) bn_new(&bls_prec->beta); @@ -881,6 +859,23 @@ void G1_mult_gen(E1* res, const Fr* expo) { vec_zero(&tmp, sizeof(tmp)); } +// maps bytes input `hash` to G1. 
+// `hash` must be `MAP_TO_G1_INPUT_LEN` (128 bytes) +// It uses construction 2 from section 5 in https://eprint.iacr.org/2019/403.pdf +int map_to_G1(E1* h, const byte* hash, const int len) { + // sanity check of length + if (len != MAP_TO_G1_INPUT_LEN) { + return INVALID; + } + // map to field elements + Fr u[2]; + map_bytes_to_Fr(&u[0], hash, MAP_TO_G1_INPUT_LEN/2); + map_bytes_to_Fr(&u[1], hash + MAP_TO_G1_INPUT_LEN/2, MAP_TO_G1_INPUT_LEN/2); + // map field elements to G1 + map_to_g1((POINTonE1 *)h, (limb_t *)&u[0], (limb_t *)&u[1]); + return VALID; +} + // ------------------- E2 utilities // TODO: to delete diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 103577013cc..253d8904ca1 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -302,8 +302,7 @@ func hashToG1Bytes(data, dst []byte) []byte { // map the hash to G1 var point pointE1 - ret := C.map_to_G1((*C.E1)(&point), (*C.uchar)(&hash[0]), (C.int)(len(hash))) - if int(ret) != valid { + if C.map_to_G1((*C.E1)(&point), (*C.uchar)(&hash[0]), (C.int)(len(hash))) != valid { return nil } diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 58023376c45..cddab7d5edc 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -48,29 +48,8 @@ #define MEMBERSHIP_CHECK_G2 EXP_ORDER -// constants used in the optimized SWU hash to curve -#if (hashToPoint == LOCAL_SSWU) - #define ELLP_Nx_LEN 12 - #define ELLP_Dx_LEN 10 - #define ELLP_Ny_LEN 16 - #define ELLP_Dy_LEN 15 -#endif - - // Structure of precomputed data typedef struct prec_ { - #if (hashToPoint == LOCAL_SSWU) - // constants needed in optimized SSWU - bn_st p_3div4; - fp_st sqrt_z; - // related hardcoded constants for faster access, - // where a1 is the coefficient of isogenous curve E1 - fp_st minus_a1; - fp_st a1z; - // coefficients of the isogeny map - fp_st iso_Nx[ELLP_Nx_LEN]; - fp_st iso_Ny[ELLP_Ny_LEN]; - #endif #if (MEMBERSHIP_CHECK_G1 == BOWE) bn_st beta; bn_st z2_1_by3; diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 3fa827d2cc9..b5c142ad1bb 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -69,7 +69,7 @@ func TestMapToG1(t *testing.T) { for i, msg := range msgs { pointBytes := hashToG1Bytes(msg, dst) - require.NonNil(t, pointBytes) + require.NotNil(t, pointBytes) expectedPointBytes, err := hex.DecodeString(expectedPointString[i]) require.NoError(t, err) @@ -90,7 +90,7 @@ func BenchmarkMapToG1(b *testing.B) { for i := 0; i < b.N; i++ { p = mapToG1(input) } - require.NonNil(b, p) + require.NotNil(b, p) b.StopTimer() } diff --git a/crypto/bls_include.h b/crypto/bls_include.h index 0da961feae2..079172aa221 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -20,11 +20,6 @@ #define DOUBLE_PAIRING 1 #define SINGLE_PAIRING (DOUBLE_PAIRING^1) -// algorithm choice for hashing to G1 -#define BLST_SSWU 1 // BLST implementation -#define LOCAL_SSWU 2 // local implementation -#define hashToPoint BLST_SSWU - // bls core (functions in bls_core.c) int get_signature_len(); int get_pk_len(); From 56081df0edffb4d11a12aad3ca7b347e3264dee1 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 12 May 2023 00:42:31 -0600 Subject: [PATCH 081/200] clean up membership check macros and delete Bowe's check code --- crypto/bls12381_utils.c | 68 ----------------------------------------- crypto/bls12381_utils.h | 13 -------- 2 files changed, 81 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 13b9f948bed..ee2b23f2085 100644 --- 
a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -61,11 +61,6 @@ ctx_t* relic_init_BLS12_381() { prec_st bls_prec_st; prec_st* bls_prec = NULL; -#if (MEMBERSHIP_CHECK_G1 == BOWE) -extern const uint64_t beta_data[Fp_LIMBS]; -extern const uint64_t z2_1_by3_data[2]; -#endif - // sets the global variable to input void precomputed_data_set(const prec_st* p) { bls_prec = (prec_st*)p; @@ -83,13 +78,6 @@ prec_st* init_precomputed_data_BLS12_381() { // (p-1)/2 bn_div_dig(&bls_prec->p_1div2, &ctx->prime, 2); - #if (MEMBERSHIP_CHECK_G1 == BOWE) - bn_new(&bls_prec->beta); - bn_read_raw(&bls_prec->beta, beta_data, Fp_LIMBS); - bn_new(&bls_prec->z2_1_by3); - bn_read_raw(&bls_prec->z2_1_by3, z2_1_by3_data, 2); - #endif - // Montgomery constant R fp_set_dig(bls_prec->r, 1); return bls_prec; @@ -1327,62 +1315,6 @@ int G1_simple_subgroup_check(const ep_t p){ return VALID; } -#if (MEMBERSHIP_CHECK_G1 == BOWE) -// beta such that beta^3 == 1 mod p -// beta is in the Montgomery form -const uint64_t beta_data[Fp_LIMBS] = { - 0xcd03c9e48671f071, 0x5dab22461fcda5d2, 0x587042afd3851b95, - 0x8eb60ebe01bacb9e, 0x03f97d6e83d050d2, 0x18f0206554638741, -}; - - -// (z^2-1)/3 with z being the parameter of bls12-381 -const uint64_t z2_1_by3_data[2] = { - 0x0000000055555555, 0x396c8c005555e156 -}; - -// uses Bowe's check from section 3.2 from https://eprint.iacr.org/2019/814.pdf -// to check whether a point on the curve E1 is in G1. -int bowe_subgroup_check_G1(const ep_t p){ - if (ep_is_infty(p) == 1) - return VALID; - fp_t b; - dv_copy(b, beta_data, Fp_LIMBS); - ep_t sigma, sigma2, p_inv; - ep_new(sigma); - ep_new(sigma2); - ep_new(p_inv); - - // si(p) - ep_copy(sigma, p); - fp_mul(sigma[0].x, sigma[0].x, b); - // -si^2(p) - ep_copy(sigma2, sigma); - fp_mul(sigma2[0].x, sigma2[0].x, b); - fp_neg(sigma2[0].y, sigma2[0].y); - ep_dbl(sigma, sigma); - // -p - ep_copy(p_inv, p); - fp_neg(p_inv[0].y, p_inv[0].y); - // (z^2-1)/3 (2*si(p) - p - si^2(p)) - si^2(p) - ep_add(sigma, sigma, p_inv); - ep_add(sigma, sigma, sigma2); - // TODO: multiplication using a chain? - ep_mul_lwnaf(sigma, sigma, &bls_prec->z2_1_by3); - ep_add(sigma, sigma, sigma2); - - ep_free(sigma2); - ep_free(p_inv); - // check result against infinity - if (!ep_is_infty(sigma)){ - ep_free(sigma); - return INVALID; - } - ep_free(sigma); - return VALID; -} -#endif - /* // maps the bytes to a point in G1 // this is a testing file only, should not be used in any protocol! 
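Editor's note: with Bowe's optimized check deleted by this patch, G1 membership of deserialized points falls back to the generic "multiply by the group order" test (G1_simple_subgroup_check in this series, later superseded by blst's E1_in_G1). The sketch below is illustrative only; it reuses the Relic names already used in these files and is not code introduced by the patch. The scalar multiplication must be a plain double-and-add: a multiplier that reduces the scalar modulo the group order r (lwnaf/GLV style) would reduce r to zero and accept any curve point.

static int subgroup_check_by_order_sketch(const ep_t p) {
    ep_t inf;
    ep_new(inf);
    // p is in G1 iff r*p is the point at infinity, where r = core_get()->ep_r
    // is the prime order of G1; use the basic double-and-add multiplier on purpose.
    ep_mul_basic(inf, p, &core_get()->ep_r);
    int in_g1 = ep_is_infty(inf);
    ep_free(inf);
    return in_g1 ? VALID : INVALID;
}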
diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index cddab7d5edc..12ae39db9ee 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -41,19 +41,9 @@ #define G1_SER_BYTES (G1_BYTES/(G1_SERIALIZATION+1)) #define G2_SER_BYTES (G2_BYTES/(G2_SERIALIZATION+1)) -// Subgroup membership check method -#define EXP_ORDER 0 -#define BOWE 1 -#define MEMBERSHIP_CHECK_G1 BOWE -#define MEMBERSHIP_CHECK_G2 EXP_ORDER - // Structure of precomputed data typedef struct prec_ { - #if (MEMBERSHIP_CHECK_G1 == BOWE) - bn_st beta; - bn_st z2_1_by3; - #endif // other field-related constants bn_st p_1div2; fp_t r; // Montgomery multiplication constant @@ -122,9 +112,6 @@ void ep_sum_vector(ep_t, ep_st*, const int); int ep_sum_vector_byte(byte*, const byte*, const int); void map_bytes_to_G1(E1*, const uint8_t*, int); void map_bytes_to_G1complement(E1*, const uint8_t*, int); -#if (MEMBERSHIP_CHECK_G1 == BOWE) -int bowe_subgroup_check_G1(const ep_t); -#endif // E2 and G2 utilities void E2_set_infty(E2* p); From 13025e169bfd5f80ad59fcdfdf5e69c0c51f9cdd Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 12 May 2023 11:23:05 -0600 Subject: [PATCH 082/200] refactor bls_verify_ep to use hashed point --- crypto/bls_core.c | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 7b3021b84a1..7f9b6e508ae 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -48,10 +48,10 @@ int bls_sign(byte* s, const Fr* sk, const byte* data, const int len) { } // Verifies a BLS signature (G1 point) against a public key (G2 point) -// and a message data. -// The signature and public key are assumed to be in G1 and G2 respectively. This +// and a message hash `h` (G1 point). +// Hash, signature and public key are assumed to be in G1, G1 and G2 respectively. This // function only checks the pairing equality. -static int bls_verify_ep(const E2* pk, const ep_t s, const byte* data, const int len) { +static int bls_verify_ep(const E2* pk, const ep_t s, const ep_t h) { ep_t elemsG1[2]; ep2_t elemsG2[2]; @@ -70,10 +70,7 @@ static int bls_verify_ep(const E2* pk, const ep_t s, const byte* data, const int ep2_copy(elemsG2[1], pk_tmp); // elemsG1[1] = h - if (map_to_G1(elemsG1[1], data, len) != VALID) { - ret = INVALID; - goto out; - } + ep_copy(elemsG1[1], h); #if DOUBLE_PAIRING // elemsG2[0] = -g2 @@ -314,8 +311,8 @@ int bls_verifyPerDistinctKey(const byte* sig, // the membership check in G2 is separated to optimize multiple verifications using the same key. // `data` represents the hashed message with length `len` equal to `MAP_TO_G1_INPUT_LEN`. int bls_verify(const E2* pk, const byte* sig, const byte* data, const int len) { - ep_t s; - ep_new(s); + ep_t s, h; + ep_new(s) ep_new(h); // deserialize the signature into a curve point int read_ret = ep_read_bin_compact(s, sig, SIGNATURE_LEN); @@ -327,8 +324,12 @@ int bls_verify(const E2* pk, const byte* sig, const byte* data, const int len) { if (E1_in_G1(s) != VALID) { return INVALID; } + + if (map_to_G1(h, data, len) != VALID) { + return INVALID; + } - return bls_verify_ep(pk, s, data, len); + return bls_verify_ep(pk, s, h); } @@ -413,10 +414,9 @@ static node* build_tree(const int len, const E2* pks, const ep_st* sigs) { } // verify the binary tree and fill the results using recursive batch verifications. 
-static void bls_batch_verify_tree(const node* root, const int len, byte* results, - const byte* data, const int data_len) { +static void bls_batch_verify_tree(const node* root, const int len, byte* results, const ep_t h) { // verify the aggregated signature against the aggregated public key. - int res = bls_verify_ep(root->pk, root->sig, data, data_len); + int res = bls_verify_ep(root->pk, root->sig, h); // if the result is valid, all the subtree signatures are valid. if (res == VALID) { @@ -436,8 +436,8 @@ static void bls_batch_verify_tree(const node* root, const int len, byte* results // use the binary tree structure to find the invalid signatures. int right_len = len/2; int left_len = len - right_len; - bls_batch_verify_tree(root->left, left_len, &results[0], data, data_len); - bls_batch_verify_tree(root->right, right_len, &results[left_len], data, data_len); + bls_batch_verify_tree(root->left, left_len, &results[0], h); + bls_batch_verify_tree(root->right, right_len, &results[left_len], h); } // Batch verifies the validity of a multiple BLS signatures of the @@ -503,11 +503,19 @@ void bls_batch_verify(const int sigs_len, byte* results, const E2* pks_input, node* root = build_tree(sigs_len, &pks[0], &sigs[0]); if (!root) goto out; + ep_t h; + ep_new(h); + if (map_to_G1(h, data, data_len) != VALID) { + goto out_map; + } + // verify the binary tree and fill the results using batch verification - bls_batch_verify_tree(root, sigs_len, &results[0], data, data_len); + bls_batch_verify_tree(root, sigs_len, &results[0], h); // free the allocated tree free_tree(root); - + +out_map: + ep_free(h); out: bn_free(r); for (int i=0; i < sigs_len; i++) { From 8885e5d71d851f8750922c0a4b375a991e73e67f Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 12 May 2023 13:43:07 -0600 Subject: [PATCH 083/200] add temporary E1_blst_to_relic and use E1 in all bls_core functions - but pairing --- crypto/bls.go | 2 +- crypto/bls12381_utils.c | 59 ++++++++------- crypto/bls12381_utils.h | 4 +- crypto/bls_core.c | 160 ++++++++++++++++++++-------------------- crypto/spock.go | 8 +- 5 files changed, 120 insertions(+), 113 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 43f42f1115d..a8caa047ee7 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -320,7 +320,7 @@ const invalidBLSSignatureHeader = byte(0xE0) // makes the verification fail early. The verification would return (false, nil). 
func BLSInvalidSignature() Signature { signature := make([]byte, SignatureLenBLSBLS12381) - signature[0] = invalidBLSSignatureHeader // invalid header as per C.ep_read_bin_compact + signature[0] = invalidBLSSignatureHeader // invalid header as per the Zcash serialization return signature } diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index ee2b23f2085..0211aa6e1a5 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -87,8 +87,10 @@ prec_st* init_precomputed_data_BLS12_381() { // Montgomery constant R related to the curve order r // R mod r = (1<<256)%r -const Fr BLS12_381_rR = { TO_LIMB_T(0x1824b159acc5056f), TO_LIMB_T(0x998c4fefecbc4ff5), \ - TO_LIMB_T(0x5884b7fa00034802), TO_LIMB_T(0x00000001fffffffe), }; +const Fr BLS12_381_rR = { \ + TO_LIMB_T(0x1824b159acc5056f), TO_LIMB_T(0x998c4fefecbc4ff5), \ + TO_LIMB_T(0x5884b7fa00034802), TO_LIMB_T(0x00000001fffffffe), \ + }; // TODO: temp utility function to delete bn_st* Fr_blst_to_relic(const Fr* x) { @@ -560,7 +562,17 @@ void Fp2_write_bytes(byte *bin, const Fp2* a) { Fp_write_bytes(bin + Fp_BYTES, &imag(a)); } -// ------------------- G1 utilities +// ------------------- E1 utilities + +// TODO: temp utility function to delete +ep_st* E1_blst_to_relic(const E1* x) { + ep_st* out = (ep_st*)malloc(sizeof(ep_st)); + byte* data = (byte*)malloc(G1_SER_BYTES); + E1_write_bytes(data, x); + ep_read_bin_compact(out, data, G1_SER_BYTES); + free(data); + return out; +} // TODO: to delete, only used by temporary E2_blst_to_relic int ep_read_bin_compact(ep_t a, const byte *bin, const int len) { @@ -831,6 +843,11 @@ void E1_write_bytes(byte *bin, const E1* a) { bin[0] |= (G1_SERIALIZATION << 7); } +// generic point addition that must handle doubling and points at infinity +void E1_add(E1* res, const E1* a, const E1* b) { + POINTonE1_dadd((POINTonE1*)res, (POINTonE1*)a, (POINTonE1*)b, NULL); +} + // Exponentiation of a generic point `a` in E1, res = expo.a void E1_mult(E1* res, const E1* p, const Fr* expo) { pow256 tmp; @@ -839,6 +856,14 @@ void E1_mult(E1* res, const E1* p, const Fr* expo) { vec_zero(&tmp, sizeof(tmp)); } +// computes the sum of the E1 array elements `y[i]` and writes it in `sum`. +void E1_sum_vector(E1* sum, const E1* y, const int len){ + E1_set_infty(sum); + for (int i=0; iep_r); - if (!ep_is_infty(inf)){ - ep_free(inf); - return INVALID; - } - ep_free(inf); - return VALID; -} - /* // maps the bytes to a point in G1 // this is a testing file only, should not be used in any protocol! 
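Editor's note: the new E1 helpers added above (E1_add, E1_mult, E1_sum_vector) mirror the existing E2 API and are what later patches in this series build signature aggregation on. A minimal sketch of how they compose follows; the helper name and signature are hypothetical, not part of the patch.

static void aggregate_and_serialize_sketch(byte out[G1_SER_BYTES], const E1* sigs, const int n) {
    E1 agg;
    // agg = sigs[0] + ... + sigs[n-1]; E1_sum_vector reduces the array with E1_add,
    // which handles doubling and points at infinity
    E1_sum_vector(&agg, sigs, n);
    // compressed serialization of the aggregate (G1_SER_BYTES bytes)
    E1_write_bytes(out, &agg);
}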
diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 12ae39db9ee..6d091fec86b 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -100,12 +100,14 @@ bool_t E1_is_infty(const E1*); void E1_to_affine(E1*, const E1*); bool_t E1_affine_on_curve(const E1*); bool_t E1_in_G1(const E1*); -int G1_simple_subgroup_check(const ep_t); void E1_mult(E1*, const E1*, const Fr*); +void E1_add(E1*, const E1*, const E1*); +void E1_sum_vector(E1*, const E1*, const int); void G1_mult_gen(E1*, const Fr*); BLST_ERROR E1_read_bytes(E1*, const byte *, const int); void E1_write_bytes(byte *, const E1*); +ep_st* E1_blst_to_relic(const E1* x); int ep_read_bin_compact(ep_t, const byte *, const int); void ep_write_bin_compact(byte *, const ep_t, const int); void ep_sum_vector(ep_t, ep_st*, const int); diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 7f9b6e508ae..f020ba968c7 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -19,31 +19,26 @@ int get_sk_len() { return SK_LEN; } -// Computes a BLS signature from a G1 point -static void bls_sign_ep(byte* s, const Fr* sk, const ep_t h) { - ep_t p; - ep_new(p); - - // s = h^sk - //ep_mult(p, h, sk); - ep_copy(p, h); - ep_write_bin_compact(s, p, SIGNATURE_LEN); - ep_free(p); +// Computes a BLS signature from a G1 point and writes it in `out`. +// `out` must be allocated properly with `G1_SER_BYTES` bytes. +static void bls_sign_ep(byte* out, const Fr* sk, const E1* h) { + // s = h^s + E1 s; + E1_mult(&s, h, sk); + E1_write_bytes(out, &s); } -// Computes a BLS signature from a hash -// `data` represents the hashed message with length `len` equal to -// `MAP_TO_G1_INPUT_LEN`. -int bls_sign(byte* s, const Fr* sk, const byte* data, const int len) { - ep_t h; - ep_new(h); +// Computes a BLS signature from a hash and writes it in `out`. +// `hash` represents the hashed message with length `len` equal to `MAP_TO_G1_INPUT_LEN`. +// `out` must be allocated properly with `G1_SER_BYTES` bytes. +int bls_sign(byte* out, const Fr* sk, const byte* hash, const int len) { // hash to G1 - if (map_to_G1(h, data, len) != VALID) { + E1 h; + if (map_to_G1(&h, hash, len) != VALID) { return INVALID; } // s = h^sk - bls_sign_ep(s, sk, h); - ep_free(h); + bls_sign_ep(out, sk, &h); return VALID; } @@ -51,7 +46,7 @@ int bls_sign(byte* s, const Fr* sk, const byte* data, const int len) { // and a message hash `h` (G1 point). // Hash, signature and public key are assumed to be in G1, G1 and G2 respectively. This // function only checks the pairing equality. 
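// Editor's note (illustration, not introduced by this patch): for pk = g2^sk and an honest
// signature s = h^sk, e(s, g2) = e(h, g2)^sk = e(h, pk), so verification reduces to the
// pairing equality e(s, g2) == e(h, pk). With DOUBLE_PAIRING the code checks the equivalent
// product e(s, -g2) * e(h, pk) == 1 in the target group; with SINGLE_PAIRING it compares the
// two pairings directly.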
-static int bls_verify_ep(const E2* pk, const ep_t s, const ep_t h) { +static int bls_verify_ep(const E2* pk, const E1* s, const E1* h) { ep_t elemsG1[2]; ep2_t elemsG2[2]; @@ -63,14 +58,16 @@ static int bls_verify_ep(const E2* pk, const ep_t s, const ep_t h) { int ret = UNDEFINED; // elemsG1[0] = s - ep_copy(elemsG1[0], (ep_st*)s); + ep_st* s_tmp = E1_blst_to_relic(s); + ep_copy(elemsG1[0], s_tmp); // elemsG2[1] = pk ep2_st* pk_tmp = E2_blst_to_relic(pk); ep2_copy(elemsG2[1], pk_tmp); // elemsG1[1] = h - ep_copy(elemsG1[1], h); + ep_st* h_tmp = E1_blst_to_relic(h); + ep_copy(elemsG1[1], h_tmp); #if DOUBLE_PAIRING // elemsG2[0] = -g2 @@ -142,12 +139,16 @@ int bls_verifyPerDistinctMessage(const byte* sig, } // elemsG1[0] = sig - ret = ep_read_bin_compact(elemsG1[0], sig, SIGNATURE_LEN); - if (ret != RLC_OK) goto out; + E1 s; + if (E1_read_bytes(&s, sig, SIGNATURE_LEN) != BLST_SUCCESS) { + ret = INVALID; + goto out; + } // check s is in G1 - ret = E1_in_G1(elemsG1[0]); - if (ret != VALID) goto out; + if (!E1_in_G1(&s)) goto out; + ep_st* s_tmp = E1_blst_to_relic(&s); + ep_copy(elemsG1[0], s_tmp); // elemsG2[0] = -g2 ep2_neg(elemsG2[0], core_get()->ep2_g); // could be hardcoded @@ -157,7 +158,10 @@ int bls_verifyPerDistinctMessage(const byte* sig, for (int i=1; i < nb_hashes+1; i++) { // elemsG1[i] = h // hash to G1 - map_to_G1(elemsG1[i], &hashes[offset], len_hashes[i-1]); + E1 h; + map_to_G1(&h, &hashes[offset], len_hashes[i-1]); + ep_st* h_tmp = (ep_st*) E1_blst_to_relic(&h); + ep_copy(elemsG1[i], h_tmp); offset += len_hashes[i-1]; } @@ -230,12 +234,19 @@ int bls_verifyPerDistinctKey(const byte* sig, } // elemsG1[0] = s - ret = ep_read_bin_compact(elemsG1[0], sig, SIGNATURE_LEN); - if (ret != RLC_OK) goto out; + E1 s; + if (E1_read_bytes(&s, sig, SIGNATURE_LEN) != BLST_SUCCESS) { + ret = INVALID; + goto out; + } // check s in G1 - ret = E1_in_G1(elemsG1[0]); - if (ret != VALID) goto out; + if (!E1_in_G1(&s)){ + ret = INVALID; + goto out; + } + ep_st* s_tmp = E1_blst_to_relic(&s); + ep_copy(elemsG1[0], s_tmp); // elemsG2[0] = -g2 ep2_neg(elemsG2[0], core_get()->ep2_g); // could be hardcoded @@ -252,17 +263,18 @@ int bls_verifyPerDistinctKey(const byte* sig, // tmp_hashes is a temporary array of all hashes under a same key mapped to a G1 point. // tmp_hashes size is set to the maximum possible size to minimize malloc calls. int tmp_hashes_size = hashes_per_pk[0]; - for (int i=1; i tmp_hashes_size) + for (int i=1; i tmp_hashes_size) { tmp_hashes_size = hashes_per_pk[i]; - ep_st* tmp_hashes = (ep_st*)malloc(tmp_hashes_size * sizeof(ep_st)); + } + } + E1* tmp_hashes = (E1*)malloc(tmp_hashes_size * sizeof(E1)); if (!tmp_hashes) { ret = UNDEFINED; goto out; } // sum hashes under the same key - for (int i=0; ipk = (E2*)pk; - t->sig = (ep_st*)sig; + t->sig = (E1*)sig; t->right = t->left = NULL; } return t; @@ -374,7 +387,7 @@ static void free_tree(node* root) { } // builds a binary tree of aggregation of signatures and public keys recursively. 
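// Editor's note (illustration, not part of the patch): the tree built below halves the
// signature/key arrays at each level and stores the aggregate (sum) of each half in the
// parent node. bls_batch_verify_tree verifies the aggregate of a node with a single
// pairing check and only descends into the children when that check fails, so a batch
// with few or no invalid signatures needs far fewer pairings than one-by-one verification.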
-static node* build_tree(const int len, const E2* pks, const ep_st* sigs) { +static node* build_tree(const int len, const E2* pks, const E1* sigs) { // check if a leaf is reached if (len == 1) { return new_node(&pks[0], &sigs[0]); // use the first element of the arrays @@ -386,13 +399,12 @@ static node* build_tree(const int len, const E2* pks, const ep_st* sigs) { // create a new node with new points E2* new_pk = (E2*)malloc(sizeof(E2)); - if (!new_pk) goto error; - ep_st* new_sig = (ep_st*)malloc(sizeof(ep_st)); - if (!new_sig) goto error_sig; + if (!new_pk) {goto error;} + E1* new_sig = (E1*)malloc(sizeof(E1)); + if (!new_sig) {goto error_sig;} node* t = new_node(new_pk, new_sig); if (!t) goto error_node; - ep_new(t->sig); // build the tree in a top-down way t->left = build_tree(left_len, &pks[0], &sigs[0]); @@ -401,7 +413,7 @@ static node* build_tree(const int len, const E2* pks, const ep_st* sigs) { t->right = build_tree(right_len, &pks[left_len], &sigs[left_len]); if (!t->right) { free_tree(t); goto error; } // sum the children - ep_add_jacob(t->sig, t->left->sig, t->right->sig); + E1_add(t->sig, t->left->sig, t->right->sig); E2_add(t->pk, t->left->pk, t->right->pk); return t; @@ -414,7 +426,7 @@ static node* build_tree(const int len, const E2* pks, const ep_st* sigs) { } // verify the binary tree and fill the results using recursive batch verifications. -static void bls_batch_verify_tree(const node* root, const int len, byte* results, const ep_t h) { +static void bls_batch_verify_tree(const node* root, const int len, byte* results, const E1* h) { // verify the aggregated signature against the aggregated public key. int res = bls_verify_ep(root->pk, root->sig, h); @@ -460,11 +472,8 @@ void bls_batch_verify(const int sigs_len, byte* results, const E2* pks_input, // build the arrays of G1 and G2 elements to verify E2* pks = (E2*) malloc(sigs_len * sizeof(E2)); if (!pks) return; - ep_st* sigs = (ep_st*) malloc(sigs_len * sizeof(ep_st)); + E1* sigs = (E1*) malloc(sigs_len * sizeof(E1)); if (!sigs) goto out_sigs; - for (int i=0; i < sigs_len; i++) { - ep_new(sigs[i]); - } for (int i=0; i < sigs_len; i++) { // convert the signature points: @@ -472,15 +481,12 @@ void bls_batch_verify(const int sigs_len, byte* results, const E2* pks_input, // the tree aggregations remain valid. // - valid points are multiplied by a random scalar (same for public keys at same index) // to make sure a signature at index (i) is verified against the public key at the same index. - int read_ret = ep_read_bin_compact(&sigs[i], &sigs_bytes[SIGNATURE_LEN*i], SIGNATURE_LEN); - if (read_ret != RLC_OK || E1_in_G1(&sigs[i]) != VALID) { - if (read_ret == UNDEFINED) {// unexpected error case - goto out; - }; + int read_ret = E1_read_bytes(&sigs[i], &sigs_bytes[SIGNATURE_LEN*i], SIGNATURE_LEN); + if (read_ret != BLST_SUCCESS || !E1_in_G1(&sigs[i])) { // set signature and key to infinity (no effect on the aggregation tree) // and set result to invalid (result won't be overwritten) E2_set_infty(&pks[i]); - ep_set_infty(&sigs[i]); + E1_set_infty(&sigs[i]); results[i] = INVALID; } else { // choose a random non-zero coefficient of at least 128 bits @@ -494,33 +500,23 @@ void bls_batch_verify(const int sigs_len, byte* results, const E2* pks_input, Fr_add(&r, &r, &one); // multiply public key and signature by the same random exponent r E2_mult(&pks[i], &pks_input[i], &r); // TODO: faster version for short expos? 
- bn_st* tmp = Fr_blst_to_relic(&r); - ep_mul_lwnaf(&sigs[i], &sigs[i], tmp); - free(tmp); + E1_mult(&sigs[i], &sigs[i], &r); } } // build a binary tree of aggreagtions node* root = build_tree(sigs_len, &pks[0], &sigs[0]); if (!root) goto out; - ep_t h; - ep_new(h); - if (map_to_G1(h, data, data_len) != VALID) { - goto out_map; + E1 h; + if (map_to_G1(&h, data, data_len) != VALID) { + goto out; } // verify the binary tree and fill the results using batch verification - bls_batch_verify_tree(root, sigs_len, &results[0], h); + bls_batch_verify_tree(root, sigs_len, &results[0], &h); // free the allocated tree - free_tree(root); - -out_map: - ep_free(h); + free_tree(root); out: - bn_free(r); - for (int i=0; i < sigs_len; i++) { - ep_free(sigs[i]); - } free(sigs); out_sigs: free(pks); diff --git a/crypto/spock.go b/crypto/spock.go index 4fbd974c27f..dad711d9163 100644 --- a/crypto/spock.go +++ b/crypto/spock.go @@ -90,10 +90,10 @@ func SPOCKVerify(pk1 PublicKey, proof1 Signature, pk2 PublicKey, proof2 Signatur } // verify the spock proof using the secret data - verif := C.bls_spock_verify((*C.E2)(&blsPk1.point), - (*C.uchar)(&proof1[0]), - (*C.E2)(&blsPk2.point), - (*C.uchar)(&proof2[0])) + verif := valid /*:= C.bls_spock_verify((*C.E2)(&blsPk1.point), + (*C.uchar)(&proof1[0]), + (*C.E2)(&blsPk2.point), + (*C.uchar)(&proof2[0]))*/ switch verif { case invalid: From 9e938a97725a50d67754c046af379bdb1460c796 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 12 May 2023 17:14:11 -0600 Subject: [PATCH 084/200] implement mapping to Fp to use in map_to_G1 --- crypto/bls12381_utils.c | 44 ++++++++++++++++++++++++----------- crypto/bls12381_utils.h | 12 +++++----- crypto/bls12381_utils_test.go | 5 +--- 3 files changed, 38 insertions(+), 23 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 0211aa6e1a5..15791a6dc56 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -87,10 +87,10 @@ prec_st* init_precomputed_data_BLS12_381() { // Montgomery constant R related to the curve order r // R mod r = (1<<256)%r -const Fr BLS12_381_rR = { \ +const Fr BLS12_381_rR = {{ \ TO_LIMB_T(0x1824b159acc5056f), TO_LIMB_T(0x998c4fefecbc4ff5), \ TO_LIMB_T(0x5884b7fa00034802), TO_LIMB_T(0x00000001fffffffe), \ - }; + }}; // TODO: temp utility function to delete bn_st* Fr_blst_to_relic(const Fr* x) { @@ -187,6 +187,7 @@ void Fr_inv_montg_eucl(Fr *res, const Fr *a) { // if base = b*R, res = b^expo * R // In general, res = base^expo * R^(-expo+1) // `expo` is encoded as a little-endian limb_t table of length `expo_len`. +// TODO: clean up? void Fr_exp_montg(Fr *res, const Fr* base, const limb_t* expo, const int expo_len) { // mask of the most significant bit const limb_t msb_mask = (limb_t)1<<((sizeof(limb_t)<<3)-1); @@ -307,6 +308,7 @@ void Fr_write_bytes(byte *bin, const Fr* a) { // maps big-endian bytes into an Fr element using modular reduction // Input is byte-big-endian, output is Fr (internally vec256) +// TODO: check redc_mont_256(vec256 ret, const vec512 a, const vec256 p, limb_t n0); static void Fr_from_be_bytes(Fr* out, const byte *bytes, size_t n) { Fr digit, radix; @@ -336,7 +338,7 @@ static void Fr_from_be_bytes(Fr* out, const byte *bytes, size_t n) // Reads a scalar from an array and maps it to Fr using modular reduction. // Input is byte-big-endian as used by the external APIs. // It returns true if scalar is zero and false otherwise. 
-bool_t map_bytes_to_Fr(Fr* a, const uint8_t* bin, int len) { +bool_t map_bytes_to_Fr(Fr* a, const byte* bin, int len) { Fr_from_be_bytes(a, bin, len); return Fr_is_zero(a); } @@ -443,7 +445,7 @@ void Fp_write_bytes(byte *bin, const Fp* a) { // Unlike Relic's versions, the function does not reduce the read integer modulo p and does // not throw an exception for an integer larger than p. The function returns RLC_OK if the input // corresponds to a field element, and returns RLC_ERR otherwise. -static int fp_read_bin_safe(fp_t a, const uint8_t *bin, int len) { +static int fp_read_bin_safe(fp_t a, const byte *bin, int len) { if (len != Fp_BYTES) { return RLC_ERR; } @@ -872,6 +874,20 @@ void G1_mult_gen(E1* res, const Fr* expo) { vec_zero(&tmp, sizeof(tmp)); } + +// Reads a scalar bytes and maps it to Fp using modular reduction. +// output is in Montgomery form. +// `len` must be less or equal to 96 bytes and must be a multiple of 8. +// This function is only used by `map_to_G1` where input is 64 bytes. +// input `len` is not checked to satisfy the conditions above. +static void map_96_bytes_to_Fp(Fp* a, const byte* bin, int len) { + vec768 tmp ; + vec_zero(&tmp, sizeof(tmp)); + limbs_from_be_bytes((limb_t*)tmp, bin, len); + redc_mont_384((limb_t*)a, tmp, BLS12_381_P, p0); // aR^(-2) + Fp_mul_montg(a, a, (Fp*)BLS12_381_RRRR); // aR +} + // maps bytes input `hash` to G1. // `hash` must be `MAP_TO_G1_INPUT_LEN` (128 bytes) // It uses construction 2 from section 5 in https://eprint.iacr.org/2019/403.pdf @@ -881,10 +897,11 @@ int map_to_G1(E1* h, const byte* hash, const int len) { return INVALID; } // map to field elements - Fr u[2]; - map_bytes_to_Fr(&u[0], hash, MAP_TO_G1_INPUT_LEN/2); - map_bytes_to_Fr(&u[1], hash + MAP_TO_G1_INPUT_LEN/2, MAP_TO_G1_INPUT_LEN/2); + Fp u[2]; + map_96_bytes_to_Fp(&u[0], hash, MAP_TO_G1_INPUT_LEN/2); + map_96_bytes_to_Fp(&u[1], hash + MAP_TO_G1_INPUT_LEN/2, MAP_TO_G1_INPUT_LEN/2); // map field elements to G1 + // inputs must be in Montgomery form map_to_g1((POINTonE1 *)h, (limb_t *)&u[0], (limb_t *)&u[1]); return VALID; } @@ -892,7 +909,7 @@ int map_to_G1(E1* h, const byte* hash, const int len) { // ------------------- E2 utilities // TODO: to delete -static int fp2_read_bin_safe(fp2_t a, const uint8_t *bin, int len) { +static int fp2_read_bin_safe(fp2_t a, const byte *bin, int len) { if (len != Fp2_BYTES) { return RLC_ERR; } @@ -1327,7 +1344,7 @@ int ep_sum_vector_byte(byte* dest, const byte* sigs_bytes, const int len) { /* // maps the bytes to a point in G1 // this is a testing file only, should not be used in any protocol! -void map_bytes_to_G1(ep_t p, const uint8_t* bytes, int len) { +void map_bytes_to_G1(ep_t p, const byte* bytes, int len) { // map to Fr Fr log; map_bytes_to_Fr(&log, bytes, len); @@ -1338,7 +1355,7 @@ void map_bytes_to_G1(ep_t p, const uint8_t* bytes, int len) { // generates a point in E1\G1 and stores it in p // this is a testing file only, should not be used in any protocol! -void map_bytes_to_G1complement(ep_t p, const uint8_t* bytes, int len) { +void map_bytes_to_G1complement(ep_t p, const byte* bytes, int len) { // generate a random point in E1 p->coord = BASIC; fp_set_dig(p->z, 1); @@ -1361,7 +1378,7 @@ void map_bytes_to_G1complement(ep_t p, const uint8_t* bytes, int len) { // maps the bytes to a point in G2. // `len` should be at least Fr_BYTES. // this is a testing tool only, it should not be used in any protocol! 
-void map_bytes_to_G2(E2* p, const uint8_t* bytes, int len) { +void map_bytes_to_G2(E2* p, const byte* bytes, int len) { assert(len > Fr_BYTES); // map to Fr Fr log; @@ -1375,7 +1392,7 @@ void map_bytes_to_G2(E2* p, const uint8_t* bytes, int len) { // succeeds. // For now, function only works when E2 serialization is compressed. // this is a testing tool only, it should not be used in any protocol! -BLST_ERROR map_bytes_to_G2complement(E2* p, const uint8_t* bytes, int len) { +BLST_ERROR map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { assert(G2_SERIALIZATION == COMPRESSED); assert(len >= G2_SER_BYTES); @@ -1386,13 +1403,14 @@ BLST_ERROR map_bytes_to_G2complement(E2* p, const uint8_t* bytes, int len) { copy[0] |= 1<<7; // set compression bit copy[0] &= ~(1<<6); // clear infinity bit - point is not infinity - BLST_ERROR ser = E2_read_bytes(p, copy, len); + BLST_ERROR ser = E2_read_bytes(p, copy, G2_SER_BYTES); if (ser != BLST_SUCCESS) { return ser; } // map the point to E2\G2 by clearing G2 order E2_mult(p, p, (const Fr*)BLS12_381_r); + E2_to_affine(p, p); assert(E2_affine_on_curve(p)); // sanity check to make sure p is in E2 return BLST_SUCCESS; diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 6d091fec86b..e3e845f0c19 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -90,8 +90,8 @@ void Fr_write_bytes(byte *bin, const Fr* a); bool_t map_bytes_to_Fr(Fr*, const byte*, int); // Fp utilities -void Fp_mul_montg(Fp *, const Fp *, const Fp *); -void Fp_squ_montg(Fp *, const Fp *); +void Fp_mul_montg(Fp *, const Fp *, const Fp *); +void Fp_squ_montg(Fp *, const Fp *); // E1 and G1 utilities void E1_copy(E1*, const E1*); @@ -112,8 +112,8 @@ int ep_read_bin_compact(ep_t, const byte *, const int); void ep_write_bin_compact(byte *, const ep_t, const int); void ep_sum_vector(ep_t, ep_st*, const int); int ep_sum_vector_byte(byte*, const byte*, const int); -void map_bytes_to_G1(E1*, const uint8_t*, int); -void map_bytes_to_G1complement(E1*, const uint8_t*, int); +void map_bytes_to_G1(E1*, const byte*, int); +void map_bytes_to_G1complement(E1*, const byte*, int); // E2 and G2 utilities void E2_set_infty(E2* p); @@ -131,8 +131,8 @@ void E2_add(E2* res, const E2* a, const E2* b); void E2_sum_vector(E2*, const E2*, const int); void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); bool_t E2_in_G2(const E2*); -void map_bytes_to_G2(E2*, const uint8_t*, int); -BLST_ERROR map_bytes_to_G2complement(E2*, const uint8_t*, int); +void map_bytes_to_G2(E2*, const byte*, int); +BLST_ERROR map_bytes_to_G2complement(E2*, const byte*, int); // Utility functions ctx_t* relic_init_BLS12_381(); diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index b5c142ad1bb..2fc03efe267 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -97,7 +97,7 @@ func BenchmarkMapToG1(b *testing.B) { // test subgroup membership check in G1 and G2 func TestSubgroupCheck(t *testing.T) { prg := getPRG(t) - seed := make([]byte, securityBits/8) + seed := make([]byte, PubKeyLenBLSBLS12381) _, err := prg.Read(seed) require.NoError(t, err) @@ -113,9 +113,6 @@ func TestSubgroupCheck(t *testing.T) { t.Run("G2", func(t *testing.T) { var p pointE2 - seed := make([]byte, PubKeyLenBLSBLS12381) - _, err := mrand.Read(seed) - require.NoError(t, err) mapToG2(&p, seed) // point in G2 assert.True(t, checkMembershipG2(&p)) From f10d8819dba06569d9c762b9b02eca44b362e6d9 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 12 May 2023 
19:21:47 -0600 Subject: [PATCH 085/200] fix E1_read_bytes bug and improve debug printing --- crypto/bls12381_utils.c | 78 ++++++++++++++++++++---------------- crypto/bls12381_utils.h | 5 ++- crypto/bls_core.c | 23 +++++------ crypto/bls_crossBLST_test.go | 6 +-- crypto/bls_test.go | 1 + crypto/sign_test_utils.go | 1 + 6 files changed, 63 insertions(+), 51 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 15791a6dc56..fd1304e4ca3 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -783,11 +783,10 @@ BLST_ERROR E1_read_bytes(E1* a, const byte *bin, const int len) { } // set a.z to 1 - Fp* a_z = &(a->z); - Fp_set_limb(a_z, 1); + Fp_copy(&a->z, &BLS12_381_pR); if (G1_SERIALIZATION == UNCOMPRESSED) { - ret = Fp_read_bytes(&(a->y), bin + Fp_BYTES, sizeof(a->y)); + ret = Fp_read_bytes(&a->y, bin + Fp_BYTES, sizeof(a->y)); if (ret != BLST_SUCCESS){ return ret; } @@ -799,19 +798,16 @@ BLST_ERROR E1_read_bytes(E1* a, const byte *bin, const int len) { } // compute the possible square root - Fp* a_x = &(a->x); - Fp_to_montg(a_x, a_x); - - Fp* a_y = &(a->y); - Fp_squ_montg(a_y, a_x); - Fp_mul_montg(a_y, a_y, a_x); - Fp_add(a_y, a_y, &B_E1); // B_E1 is already in Montg form - if (!Fp_sqrt_montg(a_y, a_y)) // check whether x^3+b is a quadratic residue + Fp_to_montg(&a->x, &a->x); + Fp_squ_montg(&a->y, &a->x); + Fp_mul_montg(&a->y, &a->y, &a->x); // x^3 + Fp_add(&a->y, &a->y, &B_E1); // B_E1 is already in Montg form + if (!Fp_sqrt_montg(&a->y, &a->y)) // check whether x^3+b is a quadratic residue return BLST_POINT_NOT_ON_CURVE; // resulting (x,y) is guaranteed to be on curve (y is already in Montg form) - if (Fp_get_sign(a_y) != y_sign) { - Fp_neg(a_y, a_y); // flip y sign if needed + if (Fp_get_sign(&a->y) != y_sign) { + Fp_neg(&a->y, &a->y); // flip y sign if needed } return BLST_SUCCESS; } @@ -828,20 +824,18 @@ void E1_write_bytes(byte *bin, const E1* a) { return; } E1 tmp; - E1_to_affine(&tmp, a); // TODO: implement + E1_to_affine(&tmp, a); - Fp* t_x = &(tmp.x); - Fp_from_montg(t_x, t_x); - Fp_write_bytes(bin, t_x); + Fp_from_montg(&tmp.x, &tmp.x); + Fp_write_bytes(bin, &tmp.x); - Fp* t_y = &(tmp.y); if (G1_SERIALIZATION == COMPRESSED) { - bin[0] |= (Fp_get_sign(t_y) << 5); + bin[0] |= (Fp_get_sign(&tmp.y) << 5); } else { - Fp_from_montg(t_y, t_y); - Fp_write_bytes(bin + Fp_BYTES, t_y); + Fp_from_montg(&tmp.y, &tmp.y); + Fp_write_bytes(bin + Fp_BYTES, &tmp.y); } - + // compression bit bin[0] |= (G1_SERIALIZATION << 7); } @@ -1424,10 +1418,12 @@ void xmd_sha256(byte *hash, int len_hash, byte *msg, int len_msg, byte *dst, int // DEBUG printing functions +#define DEBUG 1 +#if DEBUG==1 void bytes_print_(char* s, byte* data, int len) { printf("[%s]:\n", s); for (int i=0; ix)); - Fp2_print_(".y", &(a->y)); - Fp2_print_(".z", &(a->z)); +void E1_print_(char* s, const E1* p, const int jacob) { + E1 a; E1_copy(&a, p); + if (!jacob) E1_to_affine(&a, &a); + printf("[%s]:\n", s); + Fp_print_(".x", &(a.x)); + Fp_print_(".y", &(a.y)); + if (jacob) Fp_print_(".z", &(a.z)); +} + +void E2_print_(char* s, const E2* p, const int jacob) { + E2 a; E2_copy(&a, p); + if (!jacob) E2_to_affine(&a, &a); + printf("[%s]:\n", s); + Fp2_print_(".x", &(a.x)); + Fp2_print_(".y", &(a.y)); + if (jacob) Fp2_print_(".z", &(a.z)); } @@ -1493,3 +1502,4 @@ void ep2_print_(char* s, ep2_st* p) { printf("[%s]:\n", s); g2_print(p); } +#endif diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index e3e845f0c19..4c39a454ab9 100644 --- a/crypto/bls12381_utils.h +++ 
b/crypto/bls12381_utils.h @@ -145,9 +145,10 @@ void xmd_sha256(byte *, int, byte *, int, byte *, int); // Debugging related functions void bytes_print_(char*, byte*, int); void Fr_print_(char*, Fr*); -void Fp_print_(char*, Fp*); +void Fp_print_(char*, const Fp*); void Fp2_print_(char*, const Fp2*); -void E2_print_(char*, const E2*); +void E1_print_(char*, const E1*, const int); +void E2_print_(char*, const E2*, const int); void fp_print_(char*, fp_t); void bn_print_(char*, bn_st*); void ep_print_(char*, ep_st*); diff --git a/crypto/bls_core.c b/crypto/bls_core.c index f020ba968c7..92911d8317a 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -21,7 +21,7 @@ int get_sk_len() { // Computes a BLS signature from a G1 point and writes it in `out`. // `out` must be allocated properly with `G1_SER_BYTES` bytes. -static void bls_sign_ep(byte* out, const Fr* sk, const E1* h) { +static void bls_sign_E1(byte* out, const Fr* sk, const E1* h) { // s = h^s E1 s; E1_mult(&s, h, sk); @@ -29,16 +29,16 @@ static void bls_sign_ep(byte* out, const Fr* sk, const E1* h) { } // Computes a BLS signature from a hash and writes it in `out`. -// `hash` represents the hashed message with length `len` equal to `MAP_TO_G1_INPUT_LEN`. +// `hash` represents the hashed message with length `hash_len` equal to `MAP_TO_G1_INPUT_LEN`. // `out` must be allocated properly with `G1_SER_BYTES` bytes. -int bls_sign(byte* out, const Fr* sk, const byte* hash, const int len) { +int bls_sign(byte* out, const Fr* sk, const byte* hash, const int hash_len) { // hash to G1 E1 h; - if (map_to_G1(&h, hash, len) != VALID) { + if (map_to_G1(&h, hash, hash_len) != VALID) { return INVALID; } // s = h^sk - bls_sign_ep(out, sk, &h); + bls_sign_E1(out, sk, &h); return VALID; } @@ -46,7 +46,7 @@ int bls_sign(byte* out, const Fr* sk, const byte* hash, const int len) { // and a message hash `h` (G1 point). // Hash, signature and public key are assumed to be in G1, G1 and G2 respectively. This // function only checks the pairing equality. -static int bls_verify_ep(const E2* pk, const E1* s, const E1* h) { +static int bls_verify_E1(const E2* pk, const E1* s, const E1* h) { ep_t elemsG1[2]; ep2_t elemsG2[2]; @@ -324,10 +324,9 @@ int bls_verifyPerDistinctKey(const byte* sig, // membership check of the signature in G1 is verified. // membership check of pk in G2 is not verified in this function. // the membership check in G2 is separated to optimize multiple verifications using the same key. -// `hash` represents the hashed message with length `len` equal to `MAP_TO_G1_INPUT_LEN`. -int bls_verify(const E2* pk, const byte* sig, const byte* hash, const int len) { +// `hash` represents the hashed message with length `hash_len` equal to `MAP_TO_G1_INPUT_LEN`. +int bls_verify(const E2* pk, const byte* sig, const byte* hash, const int hash_len) { E1 s, h; - // deserialize the signature into a curve point if (E1_read_bytes(&s, sig, SIGNATURE_LEN) != BLST_SUCCESS) { return INVALID; @@ -338,11 +337,11 @@ int bls_verify(const E2* pk, const byte* sig, const byte* hash, const int len) { return INVALID; } - if (map_to_G1(&h, hash, len) != VALID) { + if (map_to_G1(&h, hash, hash_len) != VALID) { return INVALID; } - return bls_verify_ep(pk, &s, &h); + return bls_verify_E1(pk, &s, &h); } @@ -428,7 +427,7 @@ static node* build_tree(const int len, const E2* pks, const E1* sigs) { // verify the binary tree and fill the results using recursive batch verifications. 
static void bls_batch_verify_tree(const node* root, const int len, byte* results, const E1* h) { // verify the aggregated signature against the aggregated public key. - int res = bls_verify_ep(root->pk, root->sig, h); + int res = bls_verify_E1(root->pk, root->sig, h); // if the result is valid, all the subtree signatures are valid. if (res == VALID) { diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index e9b1607a721..aabb5d0efaf 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -176,10 +176,10 @@ func testEncodeDecodeG1CrossBLST(t *rapid.T) { // // The test also assumes Flow signature serialization is identical to the one in BLST. func testSignHashCrossBLST(t *rapid.T) { - // generate two private keys from the same seed + // decode two private keys from the same bytes skBytes := rapid.Custom(validPrivateKeyBytesFlow).Example().([]byte) - skFlow, err := DecodePrivateKey(BLSBLS12381, skBytes) + require.NoError(t, err) var skBLST blst.Scalar res := skBLST.Deserialize(skBytes) @@ -194,7 +194,7 @@ func testSignHashCrossBLST(t *rapid.T) { sigBytesBLST := sigBLST.Compress() skFlowBLS, ok := skFlow.(*prKeyBLSBLS12381) - require.True(t, ok, "incoherent key type assertion") + require.True(t, ok) sigFlow := skFlowBLS.signWithXMDSHA256(message) sigBytesFlow := sigFlow.Bytes() diff --git a/crypto/bls_test.go b/crypto/bls_test.go index c3e9bb6e9db..bd7c1d7a86c 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -30,6 +30,7 @@ func TestBLSMainMethods(t *testing.T) { // This test checks that: // - signature decoding handles input x-coordinates larger than p (doesn't result in an exception) // - signature decoding only accepts reduced x-coordinates to avoid signature malleability + t.Run("invalid x coordinate larger than p", func(t *testing.T) { msg, err := hex.DecodeString("7f26ba692dc2da7ff828ef4675ff1cd6ab855fca0637b6dab295f1df8e51bc8bb1b8f0c6610aabd486cf1f098f2ddbc6691d94e10f928816f890a3d366ce46249836a595c7ea1828af52e899ba2ab627ab667113bb563918c5d5a787c414399487b4e3a7") require.NoError(t, err) diff --git a/crypto/sign_test_utils.go b/crypto/sign_test_utils.go index 8e2cd1e931f..8362df83c7f 100644 --- a/crypto/sign_test_utils.go +++ b/crypto/sign_test_utils.go @@ -106,6 +106,7 @@ func testGenSignVerify(t *testing.T, salg SigningAlgorithm, halg hash.Hasher) { require.NoError(t, err) assert.False(t, result, fmt.Sprintf( "Verification should fail:\n signature:%s\n with invalid length %d", invalidSig, invalidLen)) + } }) } From d3396c6d381330d7c3bf883a60d0c897471b325d Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 15 May 2023 12:26:18 -0600 Subject: [PATCH 086/200] update BLS threshold signature with E1 points --- crypto/bls12381_utils.c | 83 +++++++++++++++++++++------------ crypto/bls_thresholdsign_core.c | 49 +++++++------------ 2 files changed, 69 insertions(+), 63 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index fd1304e4ca3..655b3e3f8e6 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1214,40 +1214,52 @@ void E2_sum_vector(E2* sum, const E2* y, const int len){ // Membership check in G2 of both keys is not verified in this function. // the membership check in G2 is separated to allow optimizing multiple verifications // using the same public keys. 
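// Editor's note (illustration, not part of the patch), assuming each SPoCK proof is a BLS
// signature of the same secret data, as in this library's scheme: for s1 = h^sk1, s2 = h^sk2,
// pk1 = g2^sk1 and pk2 = g2^sk2, both sides of the checked equality e(s1, pk2) == e(s2, pk1)
// equal e(h, g2)^(sk1*sk2), so the equality holds whenever the two proofs were generated
// over the same data hash h.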
-/*int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* sig2) { +int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* sig2) { ep_t elemsG1[2]; ep2_t elemsG2[2]; - - // elemsG1[0] = s1 ep_new(elemsG1[0]); - int read_ret = ep_read_bin_compact(elemsG1[0], sig1, SIGNATURE_LEN); - if (read_ret != RLC_OK) - return read_ret; + ep_new(elemsG1[1]); + ep2_new(elemsG2[1]); + ep2_new(elemsG2[0]); + int ret; + // elemsG1[0] = s1 + E1 s; + if (E1_read_bytes(&s, sig1, SIGNATURE_LEN) != BLST_SUCCESS) { + ret = INVALID; + goto out; + }; // check s1 is in G1 - if (E1_in_G1(elemsG1[0]) != VALID) - return INVALID; + if (E1_in_G1(&s) != VALID) { + ret = INVALID; + goto out; + } + ep_st* s_tmp = E1_blst_to_relic(&s); + ep_copy(elemsG1[0], s_tmp); // elemsG1[1] = s2 - ep_new(elemsG1[1]); - read_ret = ep_read_bin_compact(elemsG1[1], sig2, SIGNATURE_LEN); - if (read_ret != RLC_OK) - return read_ret; - - // check s2 in G1 - if (E1_in_G1(elemsG1[1]) != VALID) - return INVALID; + E1 s; + if (E1_read_bytes(&s, sig2, SIGNATURE_LEN) != BLST_SUCCESS) { + ret = INVALID; + goto out; + }; + // check s2 is in G1 + if (E1_in_G1(&s) != VALID) { + ret = INVALID; + goto out; + } + s_tmp = E1_blst_to_relic(&s); + ep_copy(elemsG1[1], s_tmp); // elemsG2[1] = pk1 - ep2_new(elemsG2[1]); - ep2_st* tmp = E2_blst_to_relic(pk1); - ep2_copy(elemsG2[1], tmp); + ep2_st* pk_tmp = E2_blst_to_relic(pk1); + ep2_copy(elemsG2[1], pk_tmp); // elemsG2[0] = pk2 - ep2_new(elemsG2[0]); - tmp = E2_blst_to_relic(pk2); - ep2_copy(elemsG2[0], tmp); - free(tmp); + pk_tmp = E2_blst_to_relic(pk2); + ep2_copy(elemsG2[0], pk_tmp); + free(pk_tmp); + free(s_tmp); #if DOUBLE_PAIRING // elemsG2[0] = -pk2 @@ -1260,6 +1272,7 @@ void E2_sum_vector(E2* sum, const E2* y, const int len){ // compare the result to 1 int res = fp12_cmp_dig(pair, 1); + fp12_free(pair); #elif SINGLE_PAIRING fp12_t pair1, pair2; @@ -1268,19 +1281,27 @@ void E2_sum_vector(E2* sum, const E2* y, const int len){ pp_map_oatep_k12(pair2, elemsG1[1], elemsG2[1]); int res = fp12_cmp(pair1, pair2); + fp12_free(pair1); fp12_free(pair2); #endif - fp12_free(&one); + + if (core_get()->code == RLC_OK) { + if (res == RLC_EQ) { + ret = VALID; + } + else { + ret = INVALID; + } + goto out; + } + ret = UNDEFINED; + +out: ep_free(elemsG1[0]); ep_free(elemsG1[1]); ep2_free(elemsG2[0]); ep2_free(elemsG2[1]); - - if (core_get()->code == RLC_OK) { - if (res == RLC_EQ) return VALID; - return INVALID; - } - return UNDEFINED; -}*/ + return ret; +} // Subtracts all G2 array elements `y` from an element `x` and writes the // result in res diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index 777af1ef5e9..027579d3dae 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -65,28 +65,21 @@ static void Fr_lagrange_coeff_at_zero(Fr* res, const int i, const uint8_t indice // Computes the Langrange interpolation at zero P(0) = LI(0) with regards to the indices [indices(0)..indices(t)] // and their G1 images [shares(0)..shares(t)], and stores the resulting G1 point in `dest`. // `len` is equal to `t+1` where `t` is the polynomial degree. -static void E1_lagrange_interpolate_at_zero(ep_st* dest, const ep_st shares[], const uint8_t indices[], const int len) { +static void E1_lagrange_interpolate_at_zero(E1* out, const E1 shares[], const uint8_t indices[], const int len) { // Purpose is to compute Q(0) where Q(x) = A_0 + A_1*x + ... 
+ A_t*x^t in G1 // where A_i = g1 ^ a_i // Q(0) = share_i0 ^ L_i0(0) + share_i1 ^ L_i1(0) + .. + share_it ^ L_it(0) // where L is the Lagrange coefficient - // temp variables - ep_t mult; - ep_new(mult); - ep_set_infty(dest); - + E1_set_infty(out); Fr fr_lagr_coef; - for (int i=0; i < len; i++) { - Fr_lagrange_coeff_at_zero(&fr_lagr_coef, i, indices, len); - bn_st* bn_lagr_coef = Fr_blst_to_relic(&fr_lagr_coef); - ep_mul_lwnaf(mult, &shares[i], bn_lagr_coef); - free(bn_lagr_coef); - ep_add_jacob(dest, dest, mult); + E1 mult; + for (int i=0; i < len; i++) { + Fr_lagrange_coeff_at_zero(&fr_lagr_coef, i, indices, len); + E1_mult(&mult, &shares[i], &fr_lagr_coef); + E1_add(out, out, &mult); } - // free the temp memory - ep_free(mult); } // Computes the Langrange interpolation at zero LI(0) with regards to the indices [indices(0)..indices(t)] @@ -94,33 +87,25 @@ static void E1_lagrange_interpolate_at_zero(ep_st* dest, const ep_st shares[], c // `len` is equal to `t+1` where `t` is the polynomial degree. int E1_lagrange_interpolate_at_zero_write(byte* dest, const byte* shares, const uint8_t indices[], const int len) { int read_ret; - // temp variables - ep_t res; - ep_new(res); - ep_st* ep_shares = malloc(sizeof(ep_t) * len); - + E1* E1_shares = malloc(sizeof(E1) * len); for (int i=0; i < len; i++) { - ep_new(ep_shares[i]); - read_ret = ep_read_bin_compact(&ep_shares[i], &shares[SIGNATURE_LEN*i], SIGNATURE_LEN); - if (read_ret != RLC_OK) goto out; - + read_ret = E1_read_bytes(&E1_shares[i], &shares[G1_SER_BYTES * i], G1_SER_BYTES); + if (read_ret != BLST_SUCCESS) { + goto out; + } } + // G1 interpolation at 0 // computes Q(x) = A_0 + A_1*x + ... + A_t*x^t in G1, // where A_i = g1 ^ a_i - E1_lagrange_interpolate_at_zero(res, ep_shares, indices, len); - + E1 res; + E1_lagrange_interpolate_at_zero(&res, E1_shares, indices, len); // export the result - ep_write_bin_compact(dest, res, SIGNATURE_LEN); + E1_write_bytes(dest, &res); read_ret = VALID; - out: // free the temp memory - ep_free(res); - for (int i=0; i < len; i++) { - ep_free(ep_shares[i]); - } - free(ep_shares); + free(E1_shares); return read_ret; } From 557d3a71523fe5cc8d37efe61346b0895d34866e Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 15 May 2023 12:28:49 -0600 Subject: [PATCH 087/200] uncomment BLST cross check tests --- crypto/bls_crossBLST_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index aabb5d0efaf..e4e957ea495 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -217,6 +217,6 @@ func TestCrossBLST(t *testing.T) { rapid.Check(t, testKeyGenCrossBLST) rapid.Check(t, testEncodeDecodePrivateKeyCrossBLST) rapid.Check(t, testEncodeDecodePublicKeyCrossBLST) - //rapid.Check(t, testEncodeDecodeG1CrossBLST) // commented till G1 check is implemented + rapid.Check(t, testEncodeDecodeG1CrossBLST) // commented till G1 check is implemented rapid.Check(t, testSignHashCrossBLST) } From 878b7e7fabbde68d584aeacfd9ae62d4d999153a Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 15 May 2023 12:43:03 -0600 Subject: [PATCH 088/200] spock works with new E1 type --- crypto/bls12381_utils.c | 8 +++----- crypto/spock.go | 8 ++++---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 655b3e3f8e6..8e770c182f8 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1221,7 +1221,7 @@ int bls_spock_verify(const E2* pk1, const byte* sig1, const 
E2* pk2, const byte* ep_new(elemsG1[1]); ep2_new(elemsG2[1]); ep2_new(elemsG2[0]); - int ret; + int ret = UNDEFINED; // elemsG1[0] = s1 E1 s; @@ -1230,7 +1230,7 @@ int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* goto out; }; // check s1 is in G1 - if (E1_in_G1(&s) != VALID) { + if (!E1_in_G1(&s)) { ret = INVALID; goto out; } @@ -1238,13 +1238,12 @@ int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* ep_copy(elemsG1[0], s_tmp); // elemsG1[1] = s2 - E1 s; if (E1_read_bytes(&s, sig2, SIGNATURE_LEN) != BLST_SUCCESS) { ret = INVALID; goto out; }; // check s2 is in G1 - if (E1_in_G1(&s) != VALID) { + if (!E1_in_G1(&s)) { ret = INVALID; goto out; } @@ -1293,7 +1292,6 @@ int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* } goto out; } - ret = UNDEFINED; out: ep_free(elemsG1[0]); diff --git a/crypto/spock.go b/crypto/spock.go index dad711d9163..4fbd974c27f 100644 --- a/crypto/spock.go +++ b/crypto/spock.go @@ -90,10 +90,10 @@ func SPOCKVerify(pk1 PublicKey, proof1 Signature, pk2 PublicKey, proof2 Signatur } // verify the spock proof using the secret data - verif := valid /*:= C.bls_spock_verify((*C.E2)(&blsPk1.point), - (*C.uchar)(&proof1[0]), - (*C.E2)(&blsPk2.point), - (*C.uchar)(&proof2[0]))*/ + verif := C.bls_spock_verify((*C.E2)(&blsPk1.point), + (*C.uchar)(&proof1[0]), + (*C.E2)(&blsPk2.point), + (*C.uchar)(&proof2[0])) switch verif { case invalid: From 49b01a8f28c49ced5a0489093d48b8db9fe19f1f Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 15 May 2023 13:07:06 -0600 Subject: [PATCH 089/200] write E1_sum_vector using new E1 type --- crypto/bls12381_utils.c | 82 +++++++++++++++++++---------------------- crypto/bls12381_utils.h | 11 +++--- crypto/bls_multisig.go | 4 +- 3 files changed, 44 insertions(+), 53 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 8e770c182f8..2e56ed6e387 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -860,6 +860,43 @@ void E1_sum_vector(E1* sum, const E1* y, const int len){ } } +// Computes the sum of input signatures (E1 elements) flattened in a single byte array +// `sigs_bytes` of `sigs_len` bytes. +// and writes the sum (E1 element) as bytes in `dest`. +// The function does not check membership of E1 inputs in G1 subgroup. +// The header is using byte pointers to minimize Cgo calls from the Go layer. 
+int E1_sum_vector_byte(byte* dest, const byte* sigs_bytes, const int sigs_len) { + int error = UNDEFINED; + // sanity check that `len` is multiple of `G1_SER_BYTES` + if (sigs_len % G1_SER_BYTES) { + error = INVALID; + goto mem_error; + } + int n = sigs_len/G1_SER_BYTES; // number of signatures + + E1* sigs = (E1*) malloc(n * sizeof(E1)); + if (!sigs) goto mem_error; + + // import the points from the array + for (int i=0; i < n; i++) { + // deserialize each point from the input array + if (E1_read_bytes(&sigs[i], &sigs_bytes[G1_SER_BYTES*i], G1_SER_BYTES) != BLST_SUCCESS) { + error = INVALID; + goto out; + } + } + // sum the points + E1 acc; + E1_sum_vector(&acc, sigs, n); + // export the result + E1_write_bytes(dest, &acc); + error = VALID; +out: + free(sigs); +mem_error: + return error; +} + // Exponentiation of generator g1 of G1, res = expo.g1 void G1_mult_gen(E1* res, const Fr* expo) { pow256 tmp; @@ -1309,51 +1346,6 @@ void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len){ E2_add(res, x, res); } -// computes the sum of the G1 array elements y and writes the sum in jointy -void ep_sum_vector(ep_t jointx, ep_st* x, const int len) { - ep_set_infty(jointx); - for (int i=0; i Date: Mon, 15 May 2023 13:51:34 -0600 Subject: [PATCH 090/200] clean up unsecure mapping to G1/G1 and fix subgroup checks --- crypto/bls.go | 11 -------- crypto/bls12381_utils.c | 49 +++++++++++++++++++---------------- crypto/bls12381_utils.go | 46 ++++++++++++++++++++------------ crypto/bls12381_utils.h | 8 +++--- crypto/bls12381_utils_test.go | 41 +++++++++++++++-------------- 5 files changed, 82 insertions(+), 73 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index a8caa047ee7..8cfd435b380 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -549,17 +549,6 @@ func (a *blsBLS12381Algo) init() error { return nil } -// This is only a TEST/DEBUG/BENCH function. -// It returns the hash-to-G1 point from a slice of 128 bytes -func mapToG1(data []byte) *pointE1 { - l := len(data) - var h pointE1 - if C.map_to_G1((*C.E1)(&h), (*C.uchar)(&data[0]), (C.int)(l)) != valid { - return nil - } - return &h -} - // This is only a TEST function. // signWithXMDSHA256 signs a message using XMD_SHA256 as a hash to field. // diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 2e56ed6e387..a0c48795936 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1346,44 +1346,49 @@ void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len){ E2_add(res, x, res); } -/* -// maps the bytes to a point in G1 + +// maps the bytes to a point in G1. +// `len` should be at least Fr_BYTES. // this is a testing file only, should not be used in any protocol! -void map_bytes_to_G1(ep_t p, const byte* bytes, int len) { +void unsecure_map_bytes_to_G1(E1* p, const byte* bytes, int len) { + assert(len > Fr_BYTES); // map to Fr Fr log; map_bytes_to_Fr(&log, bytes, len); // multiplies G1 generator by a random scalar - - + G1_mult_gen(p, &log); } // generates a point in E1\G1 and stores it in p // this is a testing file only, should not be used in any protocol! 
-void map_bytes_to_G1complement(ep_t p, const byte* bytes, int len) { - // generate a random point in E1 - p->coord = BASIC; - fp_set_dig(p->z, 1); - do { - fp_rand(p->x); // set x to a random field element - byte r; - rand_bytes(&r, 1); - fp_zero(p->y); - fp_set_bit(p->y, 0, r&1); // set y randomly to 0 or 1 +BLST_ERROR unsecure_map_bytes_to_G1complement(E1* p, const byte* bytes, int len) { + assert(G1_SERIALIZATION == COMPRESSED); + assert(len >= G1_SER_BYTES); + + // attempt to deserilize a compressed E1 point from input bytes + // after fixing the header 2 bits + byte copy[G1_SER_BYTES]; + memcpy(copy, bytes, sizeof(copy)); + copy[0] |= 1<<7; // set compression bit + copy[0] &= ~(1<<6); // clear infinity bit - point is not infinity + + BLST_ERROR ser = E1_read_bytes(p, copy, G1_SER_BYTES); + if (ser != BLST_SUCCESS) { + return ser; } - while (ep_upk(p, p) == 0); // make sure p is in E1 - // map the point to E1\G1 by clearing G1 order - ep_mul_basic(p, p, &core_get()->ep_r); + // map the point to E2\G2 by clearing G2 order + E1_mult(p, p, (const Fr*)BLS12_381_r); + E1_to_affine(p, p); - assert(ep_on_curve(p)); // sanity check to make sure p is in E1 + assert(E1_affine_on_curve(p)); // sanity check to make sure p is in E2 + return BLST_SUCCESS; } -*/ // maps the bytes to a point in G2. // `len` should be at least Fr_BYTES. // this is a testing tool only, it should not be used in any protocol! -void map_bytes_to_G2(E2* p, const byte* bytes, int len) { +void unsecure_map_bytes_to_G2(E2* p, const byte* bytes, int len) { assert(len > Fr_BYTES); // map to Fr Fr log; @@ -1397,7 +1402,7 @@ void map_bytes_to_G2(E2* p, const byte* bytes, int len) { // succeeds. // For now, function only works when E2 serialization is compressed. // this is a testing tool only, it should not be used in any protocol! -BLST_ERROR map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { +BLST_ERROR unsecure_map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { assert(G2_SERIALIZATION == COMPRESSED); assert(len >= G2_SER_BYTES); diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 253d8904ca1..1fb9808edb6 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -256,30 +256,42 @@ func checkMembershipG2(pt *pointE2) bool { return C.E2_in_G2((*C.E2)(pt)) != (C.ulonglong)(0) } -/* -// randPointG1 wraps a call to C since cgo can't be used in go test files. -// It generates a random point in G1 and stores it in input point. -func randPointG1(pt *pointE1) { - C.ep_rand_G1((*C.E1)(pt)) +// This is only a TEST/DEBUG/BENCH function. +// It returns the hash-to-G1 point from a slice of 128 bytes +func mapToG1(data []byte) *pointE1 { + l := len(data) + var h pointE1 + if C.map_to_G1((*C.E1)(&h), (*C.uchar)(&data[0]), (C.int)(l)) != valid { + return nil + } + return &h +} + +// mapToG1 is a test function, it wraps a call to C since cgo can't be used in go test files. +// It maps input bytes to a point in G2 and stores it in input point. +// THIS IS NOT the kind of mapping function that is used in BLS signature. +func unsecureMapToG1(pt *pointE1, seed []byte) { + C.unsecure_map_bytes_to_G1((*C.E1)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) } -// randPointG1Complement wraps a call to C since cgo can't be used in go test files. -// It generates a random point in E1\G1 and stores it in input point. 
-func randPointG1Complement(pt *pointE1) { - C.ep_rand_G1complement((*C.E1)(pt)) +// unsecureMapToG1Complement is a test function, it wraps a call to C since cgo can't be used in go test files. +// It generates a random point in E2\G2 and stores it in input point. +func unsecureMapToG1Complement(pt *pointE1, seed []byte) bool { + res := C.unsecure_map_bytes_to_G1complement((*C.E1)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) + return int(res) == blst_valid } -*/ -// mapToG2 wraps a call to C since cgo can't be used in go test files. -// It generates a random point in G2 and stores it in input point. -func mapToG2(pt *pointE2, src []byte) { - C.map_bytes_to_G2((*C.E2)(pt), (*C.uchar)(&src[0]), (C.int)(len(src))) +// unsecureMapToG2 is a test function, it wraps a call to C since cgo can't be used in go test files. +// It maps input bytes to a point in G2 and stores it in input point. +// THIS IS NOT the kind of mapping function that is used in BLS signature. +func unsecureMapToG2(pt *pointE2, seed []byte) { + C.unsecure_map_bytes_to_G2((*C.E2)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) } -// mapToG2Complement wraps a call to C since cgo can't be used in go test files. +// unsecureMapToG2Complement is a test function, it wraps a call to C since cgo can't be used in go test files. // It generates a random point in E2\G2 and stores it in input point. -func mapToG2Complement(pt *pointE2, src []byte) bool { - res := C.map_bytes_to_G2complement((*C.E2)(pt), (*C.uchar)(&src[0]), (C.int)(len(src))) +func unsecureMapToG2Complement(pt *pointE2, seed []byte) bool { + res := C.unsecure_map_bytes_to_G2complement((*C.E2)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) return int(res) == blst_valid } diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index ffe8fd0f650..61a6af53069 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -111,8 +111,8 @@ void E1_write_bytes(byte *, const E1*); ep_st* E1_blst_to_relic(const E1* x); int ep_read_bin_compact(ep_t, const byte *, const int); void ep_write_bin_compact(byte *, const ep_t, const int); -void map_bytes_to_G1(E1*, const byte*, int); -void map_bytes_to_G1complement(E1*, const byte*, int); +void unsecure_map_bytes_to_G1(E1*, const byte*, int); +BLST_ERROR unsecure_map_bytes_to_G1complement(E1*, const byte*, int); // E2 and G2 utilities void E2_set_infty(E2* p); @@ -130,8 +130,8 @@ void E2_add(E2* res, const E2* a, const E2* b); void E2_sum_vector(E2*, const E2*, const int); void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); bool_t E2_in_G2(const E2*); -void map_bytes_to_G2(E2*, const byte*, int); -BLST_ERROR map_bytes_to_G2complement(E2*, const byte*, int); +void unsecure_map_bytes_to_G2(E2*, const byte*, int); +BLST_ERROR unsecure_map_bytes_to_G2complement(E2*, const byte*, int); // Utility functions ctx_t* relic_init_BLS12_381(); diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 2fc03efe267..e5207d9f68a 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -101,26 +101,30 @@ func TestSubgroupCheck(t *testing.T) { _, err := prg.Read(seed) require.NoError(t, err) - /*t.Run("G1", func(t *testing.T) { + t.Run("G1", func(t *testing.T) { var p pointE1 - randPointG1(&p) // point in G1 - res := checkMembershipG1(&p) - assert.Equal(t, res, int(valid)) - randPointG1Complement(&p) // point in E1\G1 - res = checkMembershipG1(&p) - assert.Equal(t, res, int(invalid)) - })*/ + unsecureMapToG1(&p, seed) // point in G1 + assert.True(t, checkMembershipG1(&p)) 
+ + inG1 := false + for !inG1 { + _, err := prg.Read(seed) + require.NoError(t, err) + inG1 = unsecureMapToG1Complement(&p, seed) // point in E2\G2 + } + assert.False(t, checkMembershipG1(&p)) + }) t.Run("G2", func(t *testing.T) { var p pointE2 - mapToG2(&p, seed) // point in G2 + unsecureMapToG2(&p, seed) // point in G2 assert.True(t, checkMembershipG2(&p)) inG2 := false for !inG2 { - _, err := mrand.Read(seed) + _, err := prg.Read(seed) require.NoError(t, err) - inG2 = mapToG2Complement(&p, seed) // point in E2\G2 + inG2 = unsecureMapToG2Complement(&p, seed) // point in E2\G2 } assert.False(t, checkMembershipG2(&p)) }) @@ -128,24 +132,23 @@ func TestSubgroupCheck(t *testing.T) { // subgroup membership check bench func BenchmarkSubgroupCheck(b *testing.B) { + seed := make([]byte, PubKeyLenBLSBLS12381) + _, err := mrand.Read(seed) + require.NoError(b, err) - /*b.Run("G1", func(b *testing.B) { + b.Run("G1", func(b *testing.B) { var p pointE1 - randPointG1(&p) + unsecureMapToG1(&p, seed) // point in G1 b.ResetTimer() for i := 0; i < b.N; i++ { _ = checkMembershipG1(&p) // G1 } b.StopTimer() - })*/ + }) b.Run("G2", func(b *testing.B) { var p pointE2 - seed := make([]byte, PubKeyLenBLSBLS12381) - _, err := mrand.Read(seed) - require.NoError(b, err) - mapToG2(&p, seed) // point in G2 - + unsecureMapToG2(&p, seed) // point in G2 b.ResetTimer() for i := 0; i < b.N; i++ { _ = checkMembershipG2(&p) // G2 From c338c934ed53f111cd706a0ba553931731925fb0 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 15 May 2023 14:09:35 -0600 Subject: [PATCH 091/200] clean up constants pre-computation --- crypto/bls12381_utils.c | 111 ++++----------------------------------- crypto/bls12381_utils.go | 2 - crypto/bls12381_utils.h | 18 +------ 3 files changed, 11 insertions(+), 120 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index a0c48795936..c2058c4148d 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -52,37 +52,10 @@ ctx_t* relic_init_BLS12_381() { ep_param_set(B12_P381); ep2_curve_set_twist(EP_MTYPE); // Multiplicative twist #endif - if (ret != RLC_OK) return NULL; return core_get(); } -// global variable of the pre-computed data -prec_st bls_prec_st; -prec_st* bls_prec = NULL; - -// sets the global variable to input -void precomputed_data_set(const prec_st* p) { - bls_prec = (prec_st*)p; -} - -// Reads a prime field element from a digit vector in big endian format. -// There is no conversion to Montgomery domain in this function. 
-#define fp_read_raw(a, data_pointer) dv_copy((a), (data_pointer), Fp_LIMBS) - -// pre-compute some data required for curve BLS12-381 -prec_st* init_precomputed_data_BLS12_381() { - bls_prec = &bls_prec_st; - ctx_t* ctx = core_get(); - - // (p-1)/2 - bn_div_dig(&bls_prec->p_1div2, &ctx->prime, 2); - - // Montgomery constant R - fp_set_dig(bls_prec->r, 1); - return bls_prec; -} - // ------------------- Fr utilities // Montgomery constant R related to the curve order r @@ -92,27 +65,6 @@ const Fr BLS12_381_rR = {{ \ TO_LIMB_T(0x5884b7fa00034802), TO_LIMB_T(0x00000001fffffffe), \ }}; -// TODO: temp utility function to delete -bn_st* Fr_blst_to_relic(const Fr* x) { - bn_st* out = (bn_st*)malloc(sizeof(bn_st)); - byte* data = (byte*)malloc(Fr_BYTES); - be_bytes_from_limbs(data, (limb_t*)x, Fr_BYTES); - out->alloc = RLC_DV_DIGS; - bn_read_bin(out, data, Fr_BYTES); - free(data); - return out; -} - -// TODO: temp utility function to delete -Fr* Fr_relic_to_blst(const bn_st* x){ - Fr* out = (Fr*)malloc(sizeof(Fr)); - byte* data = (byte*)malloc(Fr_BYTES); - bn_write_bin(data, Fr_BYTES, x); - Fr_read_bytes(out, data, Fr_BYTES); - free(data); - return out; -} - // returns true if a == 0 and false otherwise bool_t Fr_is_zero(const Fr* a) { return bytes_are_zero((const byte*)a, sizeof(Fr)); @@ -566,18 +518,8 @@ void Fp2_write_bytes(byte *bin, const Fp2* a) { // ------------------- E1 utilities -// TODO: temp utility function to delete -ep_st* E1_blst_to_relic(const E1* x) { - ep_st* out = (ep_st*)malloc(sizeof(ep_st)); - byte* data = (byte*)malloc(G1_SER_BYTES); - E1_write_bytes(data, x); - ep_read_bin_compact(out, data, G1_SER_BYTES); - free(data); - return out; -} - // TODO: to delete, only used by temporary E2_blst_to_relic -int ep_read_bin_compact(ep_t a, const byte *bin, const int len) { +static int ep_read_bin_compact(ep_t a, const byte *bin, const int len) { // check the length const int G1_size = (G1_BYTES/(G1_SERIALIZATION+1)); if (len!=G1_size) { @@ -641,51 +583,16 @@ int ep_read_bin_compact(ep_t a, const byte *bin, const int len) { return RLC_ERR; } - -// TODO: delete aftet deleting ep_write_bin_compact -static int fp_get_sign(const fp_t y) { - bn_t bn_y; - bn_new(bn_y); - fp_prime_back(bn_y, y); - return bn_cmp(bn_y, &bls_prec->p_1div2) == RLC_GT; +// TODO: temp utility function to delete +ep_st* E1_blst_to_relic(const E1* x) { + ep_st* out = (ep_st*)malloc(sizeof(ep_st)); + byte* data = (byte*)malloc(G1_SER_BYTES); + E1_write_bytes(data, x); + ep_read_bin_compact(out, data, G1_SER_BYTES); + free(data); + return out; } -// TODO: to delete, only used by temporary E2_blst_to_relic -void ep_write_bin_compact(byte *bin, const ep_t a, const int len) { - const int G1_size = (G1_BYTES/(G1_SERIALIZATION+1)); - - if (len!=G1_size) { - RLC_THROW(ERR_NO_BUFFER); - return; - } - - if (ep_is_infty(a)) { - // set the infinity bit - bin[0] = (G1_SERIALIZATION << 7) | (1<<6); - memset(bin+1, 0, G1_size-1); - return; - } - - RLC_TRY { - ep_t t; - ep_null(t); - ep_new(t); - ep_norm(t, a); - fp_write_bin(bin, Fp_BYTES, t->x); - - if (G1_SERIALIZATION == COMPRESSED) { - bin[0] |= (fp_get_sign(t->y) << 5); - } else { - fp_write_bin(bin + Fp_BYTES, Fp_BYTES, t->y); - } - ep_free(t); - } RLC_CATCH_ANY { - RLC_THROW(ERR_CAUGHT); - } - - bin[0] |= (G1_SERIALIZATION << 7); - } - void E1_copy(E1* res, const E1* p) { vec_copy(res, p, sizeof(E1)); } diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 1fb9808edb6..9ab3e22545d 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -54,7 
+54,6 @@ var frBytesLen = int(C.get_Fr_BYTES()) // context required for the BLS set-up type ctx struct { relicCtx *C.ctx_t - precCtx *C.prec_st } // get some constants from the C layer @@ -88,7 +87,6 @@ func (ct *ctx) initContext() error { return errors.New("Relic core init failed") } ct.relicCtx = c - ct.precCtx = C.init_precomputed_data_BLS12_381() return nil } diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 61a6af53069..23a28299722 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -42,17 +42,9 @@ #define G2_SER_BYTES (G2_BYTES/(G2_SERIALIZATION+1)) -// Structure of precomputed data -typedef struct prec_ { - // other field-related constants - bn_st p_1div2; - fp_t r; // Montgomery multiplication constant -} prec_st; - // TODO: to delete when Relic is removed -bn_st* Fr_blst_to_relic(const Fr* x); -Fr* Fr_relic_to_blst(const bn_st* x); -ep2_st* E2_blst_to_relic(const E2* x); +ep2_st* E2_blst_to_relic(const E2* x); +ep_st* E1_blst_to_relic(const E1* x); int get_valid(); int get_invalid(); @@ -107,10 +99,6 @@ int E1_sum_vector_byte(byte*, const byte*, const int); void G1_mult_gen(E1*, const Fr*); BLST_ERROR E1_read_bytes(E1*, const byte *, const int); void E1_write_bytes(byte *, const E1*); - -ep_st* E1_blst_to_relic(const E1* x); -int ep_read_bin_compact(ep_t, const byte *, const int); -void ep_write_bin_compact(byte *, const ep_t, const int); void unsecure_map_bytes_to_G1(E1*, const byte*, int); BLST_ERROR unsecure_map_bytes_to_G1complement(E1*, const byte*, int); @@ -135,8 +123,6 @@ BLST_ERROR unsecure_map_bytes_to_G2complement(E2*, const byte*, int); // Utility functions ctx_t* relic_init_BLS12_381(); -prec_st* init_precomputed_data_BLS12_381(); -void precomputed_data_set(const prec_st* p); // utility testing function void xmd_sha256(byte *, int, byte *, int, byte *, int); From 3fd05397c559270d910c8cdbcf22b0974ebae5c7 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 15 May 2023 17:35:12 -0600 Subject: [PATCH 092/200] add read/write tests for G1 points --- crypto/bls12381_utils.c | 10 ++++++++-- crypto/bls12381_utils.go | 7 ++++++- crypto/bls12381_utils.h | 3 ++- crypto/bls12381_utils_test.go | 36 +++++++++++++++++++++++++++++++++++ crypto/bls_crossBLST_test.go | 3 +-- 5 files changed, 53 insertions(+), 6 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index c2058c4148d..cd19f3ad5fa 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -597,6 +597,12 @@ void E1_copy(E1* res, const E1* p) { vec_copy(res, p, sizeof(E1)); } +// checks p1 == p2 +bool_t E1_is_equal(const E1* p1, const E1* p2) { + // `POINTonE1_is_equal` includes the infinity case + return POINTonE1_is_equal((const POINTonE1*)p1, (const POINTonE1*)p2); +} + // compare p to infinity bool_t E1_is_infty(const E1* p) { // BLST infinity points are defined by Z=0 @@ -1258,7 +1264,7 @@ void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len){ // `len` should be at least Fr_BYTES. // this is a testing file only, should not be used in any protocol! void unsecure_map_bytes_to_G1(E1* p, const byte* bytes, int len) { - assert(len > Fr_BYTES); + assert(len >= Fr_BYTES); // map to Fr Fr log; map_bytes_to_Fr(&log, bytes, len); @@ -1296,7 +1302,7 @@ BLST_ERROR unsecure_map_bytes_to_G1complement(E1* p, const byte* bytes, int len) // `len` should be at least Fr_BYTES. // this is a testing tool only, it should not be used in any protocol! 
void unsecure_map_bytes_to_G2(E2* p, const byte* bytes, int len) { - assert(len > Fr_BYTES); + assert(len >= Fr_BYTES); // map to Fr Fr log; map_bytes_to_Fr(&log, bytes, len); diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 9ab3e22545d..bc1b9dc064a 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -111,7 +111,12 @@ func (x *scalar) equals(other *scalar) bool { return C.Fr_is_equal((*C.Fr)(x), (*C.Fr)(other)) != 0 } -// comparison in G2 +// comparison in E1 +func (p *pointE1) equals(other *pointE1) bool { + return C.E1_is_equal((*C.E1)(p), (*C.E1)(other)) != 0 +} + +// comparison in E2 func (p *pointE2) equals(other *pointE2) bool { return C.E2_is_equal((*C.E2)(p), (*C.E2)(other)) != 0 } diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 23a28299722..5c6aab8313d 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -87,6 +87,7 @@ void Fp_squ_montg(Fp *, const Fp *); // E1 and G1 utilities void E1_copy(E1*, const E1*); +bool_t E1_is_equal(const E1*, const E1*); void E1_set_infty(E1*); bool_t E1_is_infty(const E1*); void E1_to_affine(E1*, const E1*); @@ -106,7 +107,7 @@ BLST_ERROR unsecure_map_bytes_to_G1complement(E1*, const byte*, int); void E2_set_infty(E2* p); bool_t E2_is_infty(const E2*); bool_t E2_affine_on_curve(const E2*); -bool_t E2_is_equal(const E2* p1, const E2* p2); +bool_t E2_is_equal(const E2*, const E2*); void E2_copy(E2*, const E2*); void E2_to_affine(E2*, const E2*); BLST_ERROR E2_read_bytes(E2*, const byte *, const int); diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index e5207d9f68a..10db3d57714 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -157,6 +157,42 @@ func BenchmarkSubgroupCheck(b *testing.B) { }) } +// specific test of G1 points Encode and decode (BLS signature since the library is set for min_sig). +// G2 points read and write are implicitly tested by public keys Encode/Decode. 
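// The roundtrip below uses the 48-byte compressed serialization (SignatureLenBLSBLS12381):
// writePointE1 encodes a point into the buffer and readPointE1 must decode it back to an
// equal point, including the special encoding of the identity (infinity) point.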
+func TestReadWriteG1(t *testing.T) { + prg := getPRG(t) + seed := make([]byte, frBytesLen) + bytes := make([]byte, SignatureLenBLSBLS12381) + // generate a random G1 point, encode it, decode it, + // and compare it the original point + iterations := 50 + t.Run("random points", func(t *testing.T) { + for i := 0; i < iterations; i++ { + var p, q pointE1 + _, err := prg.Read(seed) + unsecureMapToG1(&p, seed) + require.NoError(t, err) + writePointE1(bytes, &p) + err = readPointE1(&q, bytes) + require.NoError(t, err) + assert.True(t, p.equals(&q)) + } + }) + + t.Run("infinity", func(t *testing.T) { + for i := 0; i < iterations; i++ { + var p, q pointE1 + seed := make([]byte, frBytesLen) + unsecureMapToG1(&p, seed) // this results in the infinity point + writePointE1(bytes, &p) + require.True(t, IsBLSSignatureIdentity(bytes)) // sanity check + err := readPointE1(&q, bytes) + require.NoError(t, err) + assert.True(t, p.equals(&q)) + } + }) +} + // test some edge cases of MapToFr to validate modular reduction and endianness: // - inputs `0` and curve order `r` // - inputs `1` and `r+1` diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index e4e957ea495..ffdb156e251 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -160,7 +160,6 @@ func testEncodeDecodeG1CrossBLST(t *rapid.T) { sigFlowOutBytes := make([]byte, signatureLengthBLSBLS12381) writePointE1(sigFlowOutBytes, &pointFlow) sigBLSTOutBytes := pointBLST.Compress() - assert.Equal(t, sigFlowOutBytes, sigBLSTOutBytes) } } @@ -217,6 +216,6 @@ func TestCrossBLST(t *testing.T) { rapid.Check(t, testKeyGenCrossBLST) rapid.Check(t, testEncodeDecodePrivateKeyCrossBLST) rapid.Check(t, testEncodeDecodePublicKeyCrossBLST) - rapid.Check(t, testEncodeDecodeG1CrossBLST) // commented till G1 check is implemented + rapid.Check(t, testEncodeDecodeG1CrossBLST) rapid.Check(t, testSignHashCrossBLST) } From 29f748910d580d87823123fed95ed06abdee45cb Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 15 May 2023 18:31:10 -0600 Subject: [PATCH 093/200] rename insecure to unsafe --- crypto/bls12381_utils.c | 8 ++++---- crypto/bls12381_utils.go | 22 +++++++++++----------- crypto/bls12381_utils.h | 8 ++++---- crypto/bls12381_utils_test.go | 16 ++++++++-------- 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index cd19f3ad5fa..ca92f4f85a2 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1263,7 +1263,7 @@ void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len){ // maps the bytes to a point in G1. // `len` should be at least Fr_BYTES. // this is a testing file only, should not be used in any protocol! -void unsecure_map_bytes_to_G1(E1* p, const byte* bytes, int len) { +void unsafe_map_bytes_to_G1(E1* p, const byte* bytes, int len) { assert(len >= Fr_BYTES); // map to Fr Fr log; @@ -1274,7 +1274,7 @@ void unsecure_map_bytes_to_G1(E1* p, const byte* bytes, int len) { // generates a point in E1\G1 and stores it in p // this is a testing file only, should not be used in any protocol! -BLST_ERROR unsecure_map_bytes_to_G1complement(E1* p, const byte* bytes, int len) { +BLST_ERROR unsafe_map_bytes_to_G1complement(E1* p, const byte* bytes, int len) { assert(G1_SERIALIZATION == COMPRESSED); assert(len >= G1_SER_BYTES); @@ -1301,7 +1301,7 @@ BLST_ERROR unsecure_map_bytes_to_G1complement(E1* p, const byte* bytes, int len) // maps the bytes to a point in G2. // `len` should be at least Fr_BYTES. 
// this is a testing tool only, it should not be used in any protocol! -void unsecure_map_bytes_to_G2(E2* p, const byte* bytes, int len) { +void unsafe_map_bytes_to_G2(E2* p, const byte* bytes, int len) { assert(len >= Fr_BYTES); // map to Fr Fr log; @@ -1315,7 +1315,7 @@ void unsecure_map_bytes_to_G2(E2* p, const byte* bytes, int len) { // succeeds. // For now, function only works when E2 serialization is compressed. // this is a testing tool only, it should not be used in any protocol! -BLST_ERROR unsecure_map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { +BLST_ERROR unsafe_map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { assert(G2_SERIALIZATION == COMPRESSED); assert(len >= G2_SER_BYTES); diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index bc1b9dc064a..e83359263ea 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -273,28 +273,28 @@ func mapToG1(data []byte) *pointE1 { // mapToG1 is a test function, it wraps a call to C since cgo can't be used in go test files. // It maps input bytes to a point in G2 and stores it in input point. // THIS IS NOT the kind of mapping function that is used in BLS signature. -func unsecureMapToG1(pt *pointE1, seed []byte) { - C.unsecure_map_bytes_to_G1((*C.E1)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) +func unsafeMapToG1(pt *pointE1, seed []byte) { + C.unsafe_map_bytes_to_G1((*C.E1)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) } -// unsecureMapToG1Complement is a test function, it wraps a call to C since cgo can't be used in go test files. +// unsafeMapToG1Complement is a test function, it wraps a call to C since cgo can't be used in go test files. // It generates a random point in E2\G2 and stores it in input point. -func unsecureMapToG1Complement(pt *pointE1, seed []byte) bool { - res := C.unsecure_map_bytes_to_G1complement((*C.E1)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) +func unsafeMapToG1Complement(pt *pointE1, seed []byte) bool { + res := C.unsafe_map_bytes_to_G1complement((*C.E1)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) return int(res) == blst_valid } -// unsecureMapToG2 is a test function, it wraps a call to C since cgo can't be used in go test files. +// unsafeMapToG2 is a test function, it wraps a call to C since cgo can't be used in go test files. // It maps input bytes to a point in G2 and stores it in input point. // THIS IS NOT the kind of mapping function that is used in BLS signature. -func unsecureMapToG2(pt *pointE2, seed []byte) { - C.unsecure_map_bytes_to_G2((*C.E2)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) +func unsafeMapToG2(pt *pointE2, seed []byte) { + C.unsafe_map_bytes_to_G2((*C.E2)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) } -// unsecureMapToG2Complement is a test function, it wraps a call to C since cgo can't be used in go test files. +// unsafeMapToG2Complement is a test function, it wraps a call to C since cgo can't be used in go test files. // It generates a random point in E2\G2 and stores it in input point. 
-func unsecureMapToG2Complement(pt *pointE2, seed []byte) bool { - res := C.unsecure_map_bytes_to_G2complement((*C.E2)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) +func unsafeMapToG2Complement(pt *pointE2, seed []byte) bool { + res := C.unsafe_map_bytes_to_G2complement((*C.E2)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) return int(res) == blst_valid } diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 5c6aab8313d..1e4413b914c 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -100,8 +100,8 @@ int E1_sum_vector_byte(byte*, const byte*, const int); void G1_mult_gen(E1*, const Fr*); BLST_ERROR E1_read_bytes(E1*, const byte *, const int); void E1_write_bytes(byte *, const E1*); -void unsecure_map_bytes_to_G1(E1*, const byte*, int); -BLST_ERROR unsecure_map_bytes_to_G1complement(E1*, const byte*, int); +void unsafe_map_bytes_to_G1(E1*, const byte*, int); +BLST_ERROR unsafe_map_bytes_to_G1complement(E1*, const byte*, int); // E2 and G2 utilities void E2_set_infty(E2* p); @@ -119,8 +119,8 @@ void E2_add(E2* res, const E2* a, const E2* b); void E2_sum_vector(E2*, const E2*, const int); void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); bool_t E2_in_G2(const E2*); -void unsecure_map_bytes_to_G2(E2*, const byte*, int); -BLST_ERROR unsecure_map_bytes_to_G2complement(E2*, const byte*, int); +void unsafe_map_bytes_to_G2(E2*, const byte*, int); +BLST_ERROR unsafe_map_bytes_to_G2complement(E2*, const byte*, int); // Utility functions ctx_t* relic_init_BLS12_381(); diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 10db3d57714..ae1b240d8ae 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -103,28 +103,28 @@ func TestSubgroupCheck(t *testing.T) { t.Run("G1", func(t *testing.T) { var p pointE1 - unsecureMapToG1(&p, seed) // point in G1 + unsafeMapToG1(&p, seed) // point in G1 assert.True(t, checkMembershipG1(&p)) inG1 := false for !inG1 { _, err := prg.Read(seed) require.NoError(t, err) - inG1 = unsecureMapToG1Complement(&p, seed) // point in E2\G2 + inG1 = unsafeMapToG1Complement(&p, seed) // point in E2\G2 } assert.False(t, checkMembershipG1(&p)) }) t.Run("G2", func(t *testing.T) { var p pointE2 - unsecureMapToG2(&p, seed) // point in G2 + unsafeMapToG2(&p, seed) // point in G2 assert.True(t, checkMembershipG2(&p)) inG2 := false for !inG2 { _, err := prg.Read(seed) require.NoError(t, err) - inG2 = unsecureMapToG2Complement(&p, seed) // point in E2\G2 + inG2 = unsafeMapToG2Complement(&p, seed) // point in E2\G2 } assert.False(t, checkMembershipG2(&p)) }) @@ -138,7 +138,7 @@ func BenchmarkSubgroupCheck(b *testing.B) { b.Run("G1", func(b *testing.B) { var p pointE1 - unsecureMapToG1(&p, seed) // point in G1 + unsafeMapToG1(&p, seed) // point in G1 b.ResetTimer() for i := 0; i < b.N; i++ { _ = checkMembershipG1(&p) // G1 @@ -148,7 +148,7 @@ func BenchmarkSubgroupCheck(b *testing.B) { b.Run("G2", func(b *testing.B) { var p pointE2 - unsecureMapToG2(&p, seed) // point in G2 + unsafeMapToG2(&p, seed) // point in G2 b.ResetTimer() for i := 0; i < b.N; i++ { _ = checkMembershipG2(&p) // G2 @@ -170,7 +170,7 @@ func TestReadWriteG1(t *testing.T) { for i := 0; i < iterations; i++ { var p, q pointE1 _, err := prg.Read(seed) - unsecureMapToG1(&p, seed) + unsafeMapToG1(&p, seed) require.NoError(t, err) writePointE1(bytes, &p) err = readPointE1(&q, bytes) @@ -183,7 +183,7 @@ func TestReadWriteG1(t *testing.T) { for i := 0; i < iterations; i++ { var p, q pointE1 seed := make([]byte, frBytesLen) 
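			// an all-zero seed is reduced to the zero scalar by map_bytes_to_Fr,
			// and multiplying the G1 generator by zero yields the point at infinity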
- unsecureMapToG1(&p, seed) // this results in the infinity point + unsafeMapToG1(&p, seed) // this results in the infinity point writePointE1(bytes, &p) require.True(t, IsBLSSignatureIdentity(bytes)) // sanity check err := readPointE1(&q, bytes) From c47c3211e60da57086b7a0ee46d549834d10071b Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 15 May 2023 20:20:03 -0600 Subject: [PATCH 094/200] fix node info comparison bug in test --- model/bootstrap/node_info.go | 12 ++++++++++++ model/bootstrap/node_info_test.go | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/model/bootstrap/node_info.go b/model/bootstrap/node_info.go index cdc6f855c4a..62a33f6f442 100644 --- a/model/bootstrap/node_info.go +++ b/model/bootstrap/node_info.go @@ -174,6 +174,18 @@ type decodableNodeInfoPub struct { Stake uint64 } +func (info *NodeInfoPub) Equals(other *NodeInfoPub) bool { + if other == nil { + return false + } + return info.Address == other.Address && + info.NodeID == other.NodeID && + info.Role == other.Role && + info.Weight == other.Weight && + info.NetworkPubKey.PublicKey.Equals(other.NetworkPubKey.PublicKey) && + info.StakingPubKey.PublicKey.Equals(other.StakingPubKey.PublicKey) +} + func (info *NodeInfoPub) UnmarshalJSON(b []byte) error { var decodable decodableNodeInfoPub err := json.Unmarshal(b, &decodable) diff --git a/model/bootstrap/node_info_test.go b/model/bootstrap/node_info_test.go index 536c0c808f9..39294de5f69 100644 --- a/model/bootstrap/node_info_test.go +++ b/model/bootstrap/node_info_test.go @@ -50,7 +50,7 @@ func TestNodeInfoPubEncodingJSON(t *testing.T) { var dec bootstrap.NodeInfoPub err = json.Unmarshal(enc, &dec) require.NoError(t, err) - assert.Equal(t, conf, dec) + assert.True(t, dec.Equals(&conf)) }) t.Run("compat: should accept old files using Stake field", func(t *testing.T) { conf := unittest.NodeInfoFixture().Public() @@ -61,6 +61,6 @@ func TestNodeInfoPubEncodingJSON(t *testing.T) { var dec bootstrap.NodeInfoPub err = json.Unmarshal(enc, &dec) require.NoError(t, err) - assert.Equal(t, conf, dec) + assert.True(t, dec.Equals(&conf)) }) } From 184a49d110dbc830af1c05d8a3a6d85dcfaab148 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 16 May 2023 11:33:53 -0600 Subject: [PATCH 095/200] fix public key comparison bugs in tests --- engine/access/access_test.go | 3 --- model/encodable/keys_test.go | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/engine/access/access_test.go b/engine/access/access_test.go index a2af4f64481..768cc9b0ee2 100644 --- a/engine/access/access_test.go +++ b/engine/access/access_test.go @@ -562,15 +562,12 @@ func (suite *Suite) TestGetExecutionResultByBlockID() { for i, serviceEvent := range executionResult.ServiceEvents { assert.Equal(suite.T(), serviceEvent.Type.String(), er.ServiceEvents[i].Type) event := serviceEvent.Event - marshalledEvent, err := json.Marshal(event) require.NoError(suite.T(), err) - assert.Equal(suite.T(), marshalledEvent, er.ServiceEvents[i].Payload) } parsedExecResult, err := convert.MessageToExecutionResult(resp.ExecutionResult) require.NoError(suite.T(), err) - assert.Equal(suite.T(), parsedExecResult, executionResult) assert.Equal(suite.T(), parsedExecResult.ID(), executionResult.ID()) } diff --git a/model/encodable/keys_test.go b/model/encodable/keys_test.go index ccdf63cd044..338c1708366 100644 --- a/model/encodable/keys_test.go +++ b/model/encodable/keys_test.go @@ -247,7 +247,7 @@ func TestEncodableRandomBeaconPrivKeyMsgPack(t *testing.T) { err = key.UnmarshalMsgpack(b) 
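	// the in-memory representation of a BLST-backed public key is not a reliable
	// basis for comparison, so the assertion below uses the key's Equals method
	// rather than require.Equal on the struct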
 	require.NoError(t, err)
-	require.Equal(t, oldPubKey, key.PublicKey)
+	require.True(t, oldPubKey.Equals(key.PublicKey))
 }
 
 func generateRandomSeed(t *testing.T) []byte {

From a4fb4357656dd6fbbc5e101cda001471a3445c81 Mon Sep 17 00:00:00 2001
From: Tarak Ben Youssef
Date: Tue, 16 May 2023 12:28:08 -0600
Subject: [PATCH 096/200] yet another key comparison bug in tests

---
 model/flow/identity.go      | 13 +++++++++++++
 model/flow/identity_test.go |  8 ++++----
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/model/flow/identity.go b/model/flow/identity.go
index f05188988e6..eb86279641b 100644
--- a/model/flow/identity.go
+++ b/model/flow/identity.go
@@ -61,6 +61,19 @@ type Identity struct {
 	NetworkPubKey crypto.PublicKey
 }
 
+func (id *Identity) Equals(other *Identity) bool {
+	if other == nil {
+		return false
+	}
+	return id.NodeID == other.NodeID &&
+		id.Address == other.Address &&
+		id.Role == other.Role &&
+		id.Weight == other.Weight &&
+		id.Ejected == other.Ejected &&
+		id.StakingPubKey.Equals(other.StakingPubKey) &&
+		id.NetworkPubKey.Equals(other.NetworkPubKey)
+}
+
 // ParseIdentity parses a string representation of an identity.
 func ParseIdentity(identity string) (*Identity, error) {
 
diff --git a/model/flow/identity_test.go b/model/flow/identity_test.go
index 9c1a137d8ab..0f3b2c2145a 100644
--- a/model/flow/identity_test.go
+++ b/model/flow/identity_test.go
@@ -60,7 +60,7 @@ func TestIdentityEncodingJSON(t *testing.T) {
 		var dec flow.Identity
 		err = json.Unmarshal(enc, &dec)
 		require.NoError(t, err)
-		require.Equal(t, identity, &dec)
+		require.True(t, identity.Equals(&dec))
 	})
 
 	t.Run("empty address should be omitted", func(t *testing.T) {
@@ -73,7 +73,7 @@ func TestIdentityEncodingJSON(t *testing.T) {
 		var dec flow.Identity
 		err = json.Unmarshal(enc, &dec)
 		require.NoError(t, err)
-		require.Equal(t, identity, &dec)
+		require.True(t, identity.Equals(&dec))
 	})
 
 	t.Run("compat: should accept old files using Stake field", func(t *testing.T) {
@@ -85,7 +85,7 @@ func TestIdentityEncodingJSON(t *testing.T) {
 		var dec flow.Identity
 		err = json.Unmarshal(enc, &dec)
 		require.NoError(t, err)
-		require.Equal(t, identity, &dec)
+		require.True(t, identity.Equals(&dec))
 	})
 }
 
@@ -96,7 +96,7 @@ func TestIdentityEncodingMsgpack(t *testing.T) {
 	var dec flow.Identity
 	err = msgpack.Unmarshal(enc, &dec)
 	require.NoError(t, err)
-	require.Equal(t, identity, &dec)
+	require.True(t, identity.Equals(&dec))
 }
 
 func TestIdentityList_Exists(t *testing.T) {

From ff5a0c7d1ec3fb682836056b218f4d0214e6587e Mon Sep 17 00:00:00 2001
From: Tarak Ben Youssef
Date: Tue, 16 May 2023 14:11:21 -0600
Subject: [PATCH 097/200] another bug: compare ER based on IDs and not the in-mem struct

---
 state/protocol/badger/snapshot_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/state/protocol/badger/snapshot_test.go b/state/protocol/badger/snapshot_test.go
index 93c72cbeb9e..01c50b94336 100644
--- a/state/protocol/badger/snapshot_test.go
+++ b/state/protocol/badger/snapshot_test.go
@@ -832,7 +832,7 @@ func TestLatestSealedResult(t *testing.T) {
 			expectedResult, expectedSeal, err := rootSnapshot.SealedResult()
 			require.NoError(t, err)
 
-			assert.Equal(t, expectedResult, gotResult)
+			assert.Equal(t, expectedResult.ID(), gotResult.ID())
 			assert.Equal(t, expectedSeal, gotSeal)
 		})
 	})

From ee91d4f14e512a5303e71b8bdd822b515c519f1c Mon Sep 17 00:00:00 2001
From: Tarak Ben Youssef
Date: Tue, 23 May 2023 16:07:46 -0600
Subject: [PATCH 098/200] use BLST multi_pairing to verify BLS signature with many messages

---
 crypto/bls_core.c  |
106 +++++++++++++-------------------------------- crypto/bls_test.go | 2 +- 2 files changed, 32 insertions(+), 76 deletions(-) diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 30f25419aec..4c73e1131b2 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -89,75 +89,53 @@ int bls_verifyPerDistinctMessage(const byte* sig, int ret = UNDEFINED; // return value - ep_t* elemsG1 = (ep_t*)malloc((nb_hashes + 1) * sizeof(ep_t)); + E1* elemsG1 = (E1*)malloc((nb_hashes + 1) * sizeof(E1)); if (!elemsG1) goto outG1; - ep2_t* elemsG2 = (ep2_t*)malloc((nb_hashes + 1) * sizeof(ep2_t)); + E2* elemsG2 = (E2*)malloc((nb_hashes + 1) * sizeof(E2)); if (!elemsG2) goto outG2; - for (int i=0; i < nb_hashes+1; i++) { - ep_new(elemsG1[i]); - ep2_new(elemsG2[i]); - } - // elemsG1[0] = sig - E1 s; - if (E1_read_bytes(&s, sig, SIGNATURE_LEN) != BLST_SUCCESS) { + if (E1_read_bytes(&elemsG1[0], sig, SIGNATURE_LEN) != BLST_SUCCESS) { ret = INVALID; goto out; } - // check s is in G1 - if (!E1_in_G1(&s)) goto out; - ep_st* s_tmp = E1_blst_to_relic(&s); - ep_copy(elemsG1[0], s_tmp); + // check signature is in G1 + if (!E1_in_G1(&elemsG1[0])) { + ret = INVALID; + goto out; + } // elemsG2[0] = -g2 - ep2_neg(elemsG2[0], core_get()->ep2_g); // could be hardcoded + E2_copy(&elemsG2[0], BLS12_381_minus_g2); // map all hashes to G1 int offset = 0; for (int i=1; i < nb_hashes+1; i++) { // elemsG1[i] = h // hash to G1 - E1 h; - map_to_G1(&h, &hashes[offset], len_hashes[i-1]); - ep_st* h_tmp = (ep_st*) E1_blst_to_relic(&h); - ep_copy(elemsG1[i], h_tmp); + map_to_G1(&elemsG1[i], &hashes[offset], len_hashes[i-1]); offset += len_hashes[i-1]; } // aggregate public keys mapping to the same hash offset = 0; - E2 tmp; for (int i=1; i < nb_hashes+1; i++) { // elemsG2[i] = agg_pk[i] - E2_sum_vector(&tmp, &pks[offset] , pks_per_hash[i-1]); - ep2_st* relic_tmp = E2_blst_to_relic(&tmp); - ep2_copy(elemsG2[i], relic_tmp); - free(relic_tmp); + E2_sum_vector(&elemsG2[i], &pks[offset] , pks_per_hash[i-1]); offset += pks_per_hash[i-1]; } - fp12_t pair; - fp12_new(&pair); - // double pairing with Optimal Ate - pp_map_sim_oatep_k12(pair, (ep_t*)(elemsG1) , (ep2_t*)(elemsG2), nb_hashes+1); - - // compare the result to 1 - int cmp_res = fp12_cmp_dig(pair, 1); - - if (core_get()->code == RLC_OK) { - if (cmp_res == RLC_EQ) ret = VALID; - else ret = INVALID; + // multi pairing + Fp12 e; + multi_pairing(&e, elemsG1 , elemsG2, nb_hashes+1); + if (Fp12_is_one(&e)) { + ret = VALID; } else { - ret = UNDEFINED; + ret = INVALID; } out: - for (int i=0; i < nb_hashes+1; i++) { - ep_free(elemsG1[i]); - ep2_free(elemsG2[i]); - } free(elemsG2); outG2: free(elemsG1); @@ -185,38 +163,29 @@ int bls_verifyPerDistinctKey(const byte* sig, int ret = UNDEFINED; // return value - ep_t* elemsG1 = (ep_t*)malloc((nb_pks + 1) * sizeof(ep_t)); + E1* elemsG1 = (E1*)malloc((nb_pks + 1) * sizeof(E1)); if (!elemsG1) goto outG1; - ep2_t* elemsG2 = (ep2_t*)malloc((nb_pks + 1) * sizeof(ep2_t)); + E2* elemsG2 = (E2*)malloc((nb_pks + 1) * sizeof(E2)); if (!elemsG2) goto outG2; - for (int i=0; i < nb_pks+1; i++) { - ep_new(elemsG1[i]); - ep2_new(elemsG2[i]); - } // elemsG1[0] = s - E1 s; - if (E1_read_bytes(&s, sig, SIGNATURE_LEN) != BLST_SUCCESS) { + if (E1_read_bytes(&elemsG1[0], sig, SIGNATURE_LEN) != BLST_SUCCESS) { ret = INVALID; goto out; } // check s in G1 - if (!E1_in_G1(&s)){ + if (!E1_in_G1(&elemsG1[0])){ ret = INVALID; goto out; - } - ep_st* s_tmp = E1_blst_to_relic(&s); - ep_copy(elemsG1[0], s_tmp); + } // elemsG2[0] = -g2 - ep2_neg(elemsG2[0], core_get()->ep2_g); // 
could be hardcoded + E2_copy(&elemsG2[0], BLS12_381_minus_g2); // set the public keys for (int i=1; i < nb_pks+1; i++) { - ep2_st* tmp = E2_blst_to_relic(&pks[i-1]); - ep2_copy(elemsG2[i], tmp); - free(tmp); + E2_copy(&elemsG2[i], &pks[i-1]); } // map all hashes to G1 and aggregate the ones with the same public key @@ -246,34 +215,21 @@ int bls_verifyPerDistinctKey(const byte* sig, index_offset++; } // aggregate all the points of the array - E1 sum; - E1_sum_vector(&sum, tmp_hashes, hashes_per_pk[i-1]); - ep_st* sum_tmp = E1_blst_to_relic(&sum); - ep_copy(elemsG1[i], sum_tmp); + E1_sum_vector(&elemsG1[i], tmp_hashes, hashes_per_pk[i-1]); } - for (int i=0; icode == RLC_OK) { - if (cmp_res == RLC_EQ) ret = VALID; - else ret = INVALID; + if (Fp12_is_one(&e)) { + ret = VALID; } else { - ret = UNDEFINED; + ret = INVALID; } out: - for (int i=0; i < nb_pks+1; i++) { - ep_free(elemsG1[i]); - ep2_free(elemsG2[i]); - } free(elemsG2); outG2: free(elemsG1); diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 7de6af89325..613e68c07a5 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -900,7 +900,7 @@ func BenchmarkBatchVerify(b *testing.B) { // // Aggregate n signatures of distinct messages under different keys, // and verify the aggregated signature using the multi-signature verification with -// many message. +// many messages. func TestBLSAggregateSignaturesManyMessages(t *testing.T) { rand := getPRG(t) // number of signatures to aggregate From e84403d6489c3d60eff61c986ebf9df00c336068 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 23 May 2023 16:47:48 -0600 Subject: [PATCH 099/200] add test calling multi_pairing with length covering many values mod N_MAX --- crypto/bls12381_utils.c | 13 ------------ crypto/bls12381_utils.h | 1 - crypto/bls_test.go | 46 +++++++++++++++++++++++++++++++++++++---- 3 files changed, 42 insertions(+), 18 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 4dbd09a5886..2b51739cb81 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1445,19 +1445,6 @@ void multi_pairing(Fp12* res, const E1 *p, const E2 *q, const int len) { final_exp(res_vec, res_vec); } -// TODO: remove -void test_pairing(const E1* h, const E1* s, const E2* pk) { - Fp12 e1, e2, e3; - e(&e1, h, pk); - Fp12_print_("e1", &e1); - e(&e2, s, BLS12_381_minus_g2); - Fp12_print_("e2", &e2); - Fp12_mult(&e3, &e2, &e1); - Fp12_print_("e3", &e3); -} - - - // This is a testing function. // It wraps a call to a Relic macro since cgo can't call macros. 
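/*
 * Note on multi_pairing (defined above): it computes
 *     e(p[0], q[0]) * ... * e(p[len-1], q[len-1])
 * by accumulating the Miller loops in batches of N_MAX pairs (which the
 * accompanying bls_test.go case exercises by covering many lengths modulo
 * N_MAX) and applying one shared final exponentiation. The BLS verifiers use
 * it by checking that
 *     e(sigma, -g2) * e(H_1, apk_1) * ... * e(H_n, apk_n) == 1 in GT,
 * which is equivalent to the BLS equation e(sigma, g2) = prod_i e(H_i, apk_i)
 * for a signature sigma in G1 and public keys in G2 (min_sig setting).
 */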
void xmd_sha256(byte *hash, int len_hash, byte *msg, int len_msg, byte *dst, int len_dst){ diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 2242e38edfe..c2b7c664cbd 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -130,7 +130,6 @@ void Fp12_set_one(Fp12*); void Fp12_inv(Fp12*); // TODO: remove void Fp12_mult(Fp12*, const Fp12*, const Fp12*); // TODO: remove void multi_pairing(Fp12*, const E1*, const E2*, const int); -void test_pairing(const E1*, const E1*, const E2*); // TODO: remove // Utility functions ctx_t* relic_init_BLS12_381(); diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 613e68c07a5..c3abbcfb673 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -904,10 +904,10 @@ func BenchmarkBatchVerify(b *testing.B) { func TestBLSAggregateSignaturesManyMessages(t *testing.T) { rand := getPRG(t) // number of signatures to aggregate - sigsNum := rand.Intn(20) + 1 + sigsNum := rand.Intn(40) + 1 sigs := make([]Signature, 0, sigsNum) - // number of keys + // number of keys (less than the number of signatures) keysNum := rand.Intn(sigsNum) + 1 sks := make([]PrivateKey, 0, keysNum) // generate the keys @@ -983,8 +983,7 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { valid, err := VerifyBLSSignatureManyMessages(inputPks[:0], aggSig, inputMsgs, inputKmacs) assert.Error(t, err) assert.True(t, IsBLSAggregateEmptyListError(err)) - assert.False(t, valid, - "verification should fail with an empty key list") + assert.False(t, valid, "verification should fail with an empty key list") }) // test inconsistent input arrays @@ -1019,6 +1018,45 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { assert.False(t, valid, "verification should fail with nil hasher") inputPks[0] = tmpPK }) + + t.Run("variable number of distinct keys and messages", func(t *testing.T) { + // use a specific PRG for easier reproduction + prg := getPRG(t) + // number of signatures to aggregate + N := 100 + sigs := make([]Signature, 0, N) + msgs := make([][]byte, 0, N) + pks := make([]PublicKey, 0, N) + kmacs := make([]hash.Hasher, 0, N) + kmac := NewExpandMsgXOFKMAC128("test tag") + for i := 0; i < N; i++ { + // distinct message + msg := make([]byte, 20) + msgs = append(msgs, msg) + _, err := prg.Read(msg) + require.NoError(t, err) + // distinct key + sk := randomSK(t, prg) + pks = append(pks, sk.PublicKey()) + // generate a signature + s, err := sk.Sign(msg, kmac) + require.NoError(t, err) + sigs = append(sigs, s) + kmacs = append(kmacs, kmac) + } + + // go through all numbers of couples (msg, key) + for i := 1; i < N; i++ { + // aggregate signatures + var err error + aggSig, err = AggregateBLSSignatures(sigs[:i]) + require.NoError(t, err) + // Verify the aggregated signature + valid, err := VerifyBLSSignatureManyMessages(pks[:i], aggSig, msgs[:i], kmacs[:i]) + require.NoError(t, err, "verification errored with %d couples (msg,key)", i) + assert.True(t, valid, "verification failed with %d couples (msg,key)", i) + } + }) } // TestBLSErrorTypes verifies working of error-type-detecting functions From 9167d9e3604f0c4e3f13fc300e588b45de774c7d Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 23 May 2023 16:57:18 -0600 Subject: [PATCH 100/200] use BLST multi_pairing to verify BLS SPoCK --- crypto/bls12381_utils.c | 76 +++++++++++------------------------------ 1 file changed, 19 insertions(+), 57 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 2b51739cb81..05124d81092 100644 --- a/crypto/bls12381_utils.c +++ 
b/crypto/bls12381_utils.c @@ -1189,79 +1189,41 @@ void E2_sum_vector(E2* sum, const E2* y, const int len){ // the membership check in G2 is separated to allow optimizing multiple verifications // using the same public keys. int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* sig2) { - ep_t elemsG1[2]; - ep2_t elemsG2[2]; - ep_new(elemsG1[0]); - ep_new(elemsG1[1]); - ep2_new(elemsG2[1]); - ep2_new(elemsG2[0]); - int ret = UNDEFINED; + E1 elemsG1[2]; + E2 elemsG2[2]; // elemsG1[0] = s1 - E1 s; - if (E1_read_bytes(&s, sig1, SIGNATURE_LEN) != BLST_SUCCESS) { - ret = INVALID; - goto out; + if (E1_read_bytes(&elemsG1[0], sig1, G1_SER_BYTES) != BLST_SUCCESS) { + return INVALID; }; // check s1 is in G1 - if (!E1_in_G1(&s)) { - ret = INVALID; - goto out; + if (!E1_in_G1(&elemsG1[0])) { + return INVALID; } - ep_st* s_tmp = E1_blst_to_relic(&s); - ep_copy(elemsG1[0], s_tmp); // elemsG1[1] = s2 - if (E1_read_bytes(&s, sig2, SIGNATURE_LEN) != BLST_SUCCESS) { - ret = INVALID; - goto out; + if (E1_read_bytes(&elemsG1[1], sig2, G1_SER_BYTES) != BLST_SUCCESS) { + return INVALID; }; // check s2 is in G1 - if (!E1_in_G1(&s)) { - ret = INVALID; - goto out; + if (!E1_in_G1(&elemsG1[1])) { + return INVALID; } - s_tmp = E1_blst_to_relic(&s); - ep_copy(elemsG1[1], s_tmp); // elemsG2[1] = pk1 - ep2_st* pk_tmp = E2_blst_to_relic(pk1); - ep2_copy(elemsG2[1], pk_tmp); - - // elemsG2[0] = pk2 - pk_tmp = E2_blst_to_relic(pk2); - ep2_copy(elemsG2[0], pk_tmp); - free(pk_tmp); - free(s_tmp); + E2_copy(&elemsG2[1], pk1); // elemsG2[0] = -pk2 - ep2_neg(elemsG2[0], elemsG2[0]); + E2_neg(&elemsG2[0], pk2); - fp12_t pair; - fp12_new(&pair); - // double pairing with Optimal Ate - pp_map_sim_oatep_k12(pair, (ep_t*)(elemsG1) , (ep2_t*)(elemsG2), 2); + // double pairing + Fp12 e; + multi_pairing(&e, elemsG1 , elemsG2, 2); - // compare the result to 1 - int res = fp12_cmp_dig(pair, 1); - fp12_free(pair); - - if (core_get()->code == RLC_OK) { - if (res == RLC_EQ) { - ret = VALID; - } - else { - ret = INVALID; - } - goto out; - } - -out: - ep_free(elemsG1[0]); - ep_free(elemsG1[1]); - ep2_free(elemsG2[0]); - ep2_free(elemsG2[1]); - return ret; + if (Fp12_is_one(&e)) { + return VALID; + } + return INVALID; } // Subtracts all G2 array elements `y` from an element `x` and writes the From cf0aa6b84396c56cdfce5764fa60ba4849639a37 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 23 May 2023 17:16:56 -0600 Subject: [PATCH 101/200] clean up Fp12 tools --- crypto/bls12381_utils.c | 14 -------------- crypto/bls12381_utils.h | 2 -- 2 files changed, 16 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 05124d81092..34a84fb629b 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1317,25 +1317,11 @@ BLST_ERROR unsafe_map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { // ------------------- Pairing utilities bool_t Fp12_is_one(Fp12 *a) { - //return vec_is_equal(a[0][0], BLS12_381_Rx.p2, sizeof(a[0][0])) & - // vec_is_zero(a[0][1], sizeof(a) - sizeof(a[0][0])); return vec_is_equal(a, BLS12_381_Rx.p12, sizeof(Fp12)); } void Fp12_set_one(Fp12 *a) { vec_copy(a, BLS12_381_Rx.p12, sizeof(Fp12)); - //vec_copy(a[0][0], BLS12_381_Rx.p2, sizeof(a[0][0])); - //vec_zero(a[0][1], sizeof(a) - sizeof(a[0][0])); -} - -// TODO: remove -void Fp12_inv(Fp12 *a) { - conjugate_fp12((vec384fp6*)a); -} - -// TODO: remove -void Fp12_mult(Fp12* ret, const Fp12* a, const Fp12* b){ - mul_fp12((vec384fp6*)ret, (vec384fp6*)a, (vec384fp6*)b); } static void e(Fp12* res, const E1* 
p, const E2* q) { diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index c2b7c664cbd..c1914fbedfd 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -127,8 +127,6 @@ BLST_ERROR unsafe_map_bytes_to_G2complement(E2*, const byte*, int); // pairing and Fp12 bool_t Fp12_is_one(Fp12*); void Fp12_set_one(Fp12*); -void Fp12_inv(Fp12*); // TODO: remove -void Fp12_mult(Fp12*, const Fp12*, const Fp12*); // TODO: remove void multi_pairing(Fp12*, const E1*, const E2*, const int); // Utility functions From 98152ba0649638b8198a10482422f5322c9ad54c Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 23 May 2023 17:29:46 -0600 Subject: [PATCH 102/200] clean up Relic tools --- crypto/bls.go | 7 - crypto/bls12381_utils.c | 271 +-------------------------------------- crypto/bls12381_utils.go | 15 --- crypto/bls12381_utils.h | 24 +--- crypto/bls_core.c | 2 - 5 files changed, 9 insertions(+), 310 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 8cfd435b380..c4be5a3aa85 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -85,8 +85,6 @@ var expandMsgOutput = int(C.get_mapToG1_input_len()) // blsBLS12381Algo, embeds SignAlgo type blsBLS12381Algo struct { - // points to Relic context of BLS12-381 with all the parameters - context ctx // the signing algo and parameters algo SigningAlgorithm } @@ -535,11 +533,6 @@ var prKeyLengthBLSBLS12381 = int(C.get_sk_len()) // init sets the context of BLS12-381 curve func (a *blsBLS12381Algo) init() error { - // initializes relic context and sets the B12_381 parameters - if err := a.context.initContext(); err != nil { - return err - } - // compare the Go and C layer constants as a sanity check if signatureLengthBLSBLS12381 != SignatureLenBLSBLS12381 || pubKeyLengthBLSBLS12381 != PubKeyLenBLSBLS12381 || diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 34a84fb629b..7055a7efa1b 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -8,6 +8,7 @@ #include "bls_include.h" #include "assert.h" +// compile all blst C src along with this file #include "blst_src.c" // The functions are tested for ALLOC=AUTO (not for ALLOC=DYNAMIC) @@ -29,33 +30,6 @@ int get_mapToG1_input_len() { return MAP_TO_G1_INPUT_LEN; } - -// Initializes Relic context with BLS12-381 parameters -ctx_t* relic_init_BLS12_381() { - // check Relic was compiled with the right conf - assert(ALLOC == AUTO); - - // sanity check of Relic constants the package is relying on - assert(RLC_OK == RLC_EQ); - - // initialize relic core with a new context - ctx_t* bls_ctx = (ctx_t*) calloc(1, sizeof(ctx_t)); - if (!bls_ctx) return NULL; - core_set(bls_ctx); - if (core_init() != RLC_OK) return NULL; - - // init BLS curve - int ret = RLC_OK; - #if (FP_PRIME == 381) - ret = ep_param_set_any_pairf(); // sets B12_P381 if FP_PRIME = 381 in relic config - #else - ep_param_set(B12_P381); - ep2_curve_set_twist(EP_MTYPE); // Multiplicative twist - #endif - if (ret != RLC_OK) return NULL; - return core_get(); -} - // ------------------- Fr utilities // Montgomery constant R related to the curve order r @@ -397,44 +371,6 @@ void Fp_write_bytes(byte *bin, const Fp* a) { be_bytes_from_limbs(bin, (limb_t*)a, Fp_BYTES); } -// fp_read_bin_safe is a modified version of Relic's (void fp_read_bin). -// It reads a field element from a buffer and makes sure the big number read can be -// written as a field element (is reduced modulo p). 
-// Unlike Relic's versions, the function does not reduce the read integer modulo p and does -// not throw an exception for an integer larger than p. The function returns RLC_OK if the input -// corresponds to a field element, and returns RLC_ERR otherwise. -static int fp_read_bin_safe(fp_t a, const byte *bin, int len) { - if (len != Fp_BYTES) { - return RLC_ERR; - } - - int ret = RLC_ERR; - bn_t t; - bn_new(t); - bn_read_bin(t, bin, Fp_BYTES); - - // make sure read bn is reduced modulo p - // first check is sanity check, since current implementation of `bn_read_bin` insures - // output bn is positive - if (bn_sign(t) == RLC_NEG || bn_cmp(t, &core_get()->prime) != RLC_LT) { - goto out; - } - - if (bn_is_zero(t)) { - fp_zero(a); - } else { - if (t->used == 1) { - fp_prime_conv_dig(a, t->dp[0]); - } else { - fp_prime_conv(a, t); - } - } - ret = RLC_OK; -out: - bn_free(t); - return ret; -} - // returns the sign of y. // 1 if y > (p - 1)/2 and 0 otherwise. // y is in montgomery form @@ -524,81 +460,6 @@ void Fp2_write_bytes(byte *bin, const Fp2* a) { // ------------------- E1 utilities -// TODO: to delete, only used by temporary E2_blst_to_relic -static int ep_read_bin_compact(ep_t a, const byte *bin, const int len) { - // check the length - const int G1_size = (G1_BYTES/(G1_SERIALIZATION+1)); - if (len!=G1_size) { - return RLC_ERR; - } - - // check the compression bit - int compressed = bin[0] >> 7; - if ((compressed == 1) != (G1_SERIALIZATION == COMPRESSED)) { - return RLC_ERR; - } - - // check if the point is infinity - int is_infinity = bin[0] & (1<<6); - if (is_infinity) { - // check if the remaining bits are cleared - if (bin[0] & 0x3F) { - return RLC_ERR; - } - for (int i=1; i> 5) & 1; - if (y_sign && (!compressed)) { - return RLC_ERR; - } - - a->coord = BASIC; - fp_set_dig(a->z, 1); - // use a temporary buffer to mask the header bits and read a.x - byte temp[Fp_BYTES]; - memcpy(temp, bin, Fp_BYTES); - temp[0] &= 0x1F; - if (fp_read_bin_safe(a->x, temp, sizeof(temp)) != RLC_OK) { - return RLC_ERR; - } - - if (G1_SERIALIZATION == UNCOMPRESSED) { - if (fp_read_bin_safe(a->y, bin + Fp_BYTES, Fp_BYTES) != RLC_OK) { - return RLC_ERR; - } - // check read point is on curve - if (!ep_on_curve(a)) { - return RLC_ERR; - } - return RLC_OK; - } - fp_zero(a->y); - fp_set_bit(a->y, 0, y_sign); - if (ep_upk(a, a) == 1) { - // resulting point is guaranteed to be on curve - return RLC_OK; - } - return RLC_ERR; -} - -// TODO: temp utility function to delete -ep_st* E1_blst_to_relic(const E1* x) { - ep_st* out = (ep_st*)malloc(sizeof(ep_st)); - byte* data = (byte*)malloc(G1_SER_BYTES); - E1_write_bytes(data, x); - ep_read_bin_compact(out, data, G1_SER_BYTES); - free(data); - return out; -} - void E1_copy(E1* res, const E1* p) { if ((uptr_t)p == (uptr_t)res) { return; @@ -872,97 +733,6 @@ const E1* BLS12_381_g1 = (const E1*)&BLS12_381_G1; /// TODO:delete const E2* BLS12_381_g2 = (const E2*)&BLS12_381_G2; const E2* BLS12_381_minus_g2 = (const E2*)&BLS12_381_NEG_G2; -// TODO: to delete -static int fp2_read_bin_safe(fp2_t a, const byte *bin, int len) { - if (len != Fp2_BYTES) { - return RLC_ERR; - } - if (fp_read_bin_safe(a[0], bin, Fp_BYTES) != RLC_OK) { - return RLC_ERR; - } - if (fp_read_bin_safe(a[1], bin + Fp_BYTES, Fp_BYTES) != RLC_OK) { - return RLC_ERR; - } - return RLC_OK; -} - -// TODO: to delete, only used by temporary E2_blst_to_relic -static int ep2_read_bin_compact(ep2_t a, const byte *bin, const int len) { - // check the length - const int G2size = (G2_BYTES/(G2_SERIALIZATION+1)); - if 
(len!=G2size) { - return RLC_ERR; - } - - // check the compression bit - int compressed = bin[0] >> 7; - if ((compressed == 1) != (G2_SERIALIZATION == COMPRESSED)) { - return RLC_ERR; - } - - // check if the point in infinity - int is_infinity = bin[0] & 0x40; - if (is_infinity) { - // the remaining bits need to be cleared - if (bin[0] & 0x3F) { - return RLC_ERR; - } - for (int i=1; i> 5) & 1; - if (y_sign && (!compressed)) { - return RLC_ERR; - } - - a->coord = BASIC; - fp2_set_dig(a->z, 1); // a.z - // use a temporary buffer to mask the header bits and read a.x - byte temp[Fp2_BYTES]; - memcpy(temp, bin, Fp2_BYTES); - temp[0] &= 0x1F; // clear the header bits - if (fp2_read_bin_safe(a->x, temp, sizeof(temp)) != RLC_OK) { - return RLC_ERR; - } - - if (G2_SERIALIZATION == UNCOMPRESSED) { - if (fp2_read_bin_safe(a->y, bin + Fp2_BYTES, Fp2_BYTES) != RLC_OK){ - return RLC_ERR; - } - // check read point is on curve - if (!ep2_on_curve(a)) { - return RLC_ERR; - } - return RLC_OK; - } - - fp2_zero(a->y); - fp_set_bit(a->y[0], 0, y_sign); - fp_zero(a->y[1]); - if (ep2_upk(a, a) == 1) { - // resulting point is guaranteed to be on curve - return RLC_OK; - } - return RLC_ERR; -} - -// TODO: temp utility function to delete -ep2_st* E2_blst_to_relic(const E2* x) { - ep2_st* out = (ep2_st*)malloc(sizeof(ep2_st)); - byte* data = (byte*)malloc(G2_SER_BYTES); - E2_write_bytes(data, x); - ep2_read_bin_compact(out, data, G2_SER_BYTES); - free(data); - return out; -} - // E2_read_bytes imports a E2(Fp^2) point from a buffer in a compressed or uncompressed form. // The resulting point is guaranteed to be on curve E2 (no G2 check is included). // @@ -1324,13 +1094,6 @@ void Fp12_set_one(Fp12 *a) { vec_copy(a, BLS12_381_Rx.p12, sizeof(Fp12)); } -static void e(Fp12* res, const E1* p, const E2* q) { - E1 p_aff; E1_to_affine(&p_aff, p); - E2 q_aff; E2_to_affine(&q_aff, q); - miller_loop_n((vec384fp6*)res, (POINTonE2_affine*)&q_aff, (POINTonE1_affine*)&p_aff, 1); - final_exp((vec384fp6*)res, (vec384fp6*)res); -} - // computes e(p[0], q[0]) * ... * e(q[len-1], q[len-1]) // by optimizing a common final exponentiation for all pairings. // result is stored in `res`. 
@@ -1401,8 +1164,7 @@ void xmd_sha256(byte *hash, int len_hash, byte *msg, int len_msg, byte *dst, int // DEBUG printing functions -#define DEBUG 1 -#if DEBUG==1 +#if (DEBUG == 1) void bytes_print_(char* s, byte* data, int len) { if (strlen(s)) printf("[%s]:\n", s); for (int i=0; ileft) { // no need to check the right child for the leaf check because // the recursive build starts with the left side first - // relic free - if (root->sig) ep_free(root->sig); // pointer free free(root->sig); free(root->pk); From b6b90d61c131db2451dfc5f6d262bc6a6028b1ab Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 23 May 2023 17:39:43 -0600 Subject: [PATCH 103/200] uncomment a test --- crypto/bls_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/bls_test.go b/crypto/bls_test.go index c3abbcfb673..377683addf2 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -19,7 +19,7 @@ import ( // TestBLSMainMethods is a sanity check of main signature scheme methods (keyGen, sign, verify) func TestBLSMainMethods(t *testing.T) { // test the key generation seed lengths - //testKeyGenSeed(t, BLSBLS12381, KeyGenSeedMinLen, KeyGenSeedMaxLen) + testKeyGenSeed(t, BLSBLS12381, KeyGenSeedMinLen, KeyGenSeedMaxLen) // test the consistency with different inputs hasher := NewExpandMsgXOFKMAC128("test tag") testGenSignVerify(t, BLSBLS12381, hasher) From 3ec1ecfdda9d6a52bb8128346e4f5fabb2675bf3 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 24 May 2023 14:18:07 -0600 Subject: [PATCH 104/200] remove relic tag and delete non-needed files --- crypto/bls.go | 15 --- crypto/bls12381_utils.c | 2 - crypto/bls12381_utils.go | 3 - crypto/bls12381_utils.h | 2 - crypto/bls12381_utils_test.go | 3 - crypto/bls_core.c | 2 - crypto/bls_crossBLST_test.go | 3 - crypto/bls_include.h | 2 - crypto/bls_multisig.go | 3 - crypto/bls_no_relic.go | 156 ----------------------------- crypto/bls_no_relic_test.go | 42 -------- crypto/bls_test.go | 3 - crypto/bls_thresholdsign.go | 3 - crypto/bls_thresholdsign_core.c | 2 - crypto/bls_thresholdsign_include.h | 2 - crypto/bls_thresholdsign_test.go | 3 - crypto/blst_include.h | 2 - crypto/blst_src/blst_src.c | 2 - crypto/dkg_core.c | 2 - crypto/dkg_feldmanvss.go | 3 - crypto/dkg_feldmanvssq.go | 3 - crypto/dkg_include.h | 2 - crypto/dkg_jointfeldman.go | 3 - crypto/dkg_test.go | 3 - crypto/ecdsa_test.go | 3 - crypto/sign.go | 35 +++---- crypto/sign_norelic.go | 13 --- crypto/sign_relic.go | 42 -------- crypto/spock.go | 3 - crypto/spock_test.go | 3 - 30 files changed, 18 insertions(+), 347 deletions(-) delete mode 100644 crypto/bls_no_relic.go delete mode 100644 crypto/bls_no_relic_test.go delete mode 100644 crypto/sign_norelic.go delete mode 100644 crypto/sign_relic.go diff --git a/crypto/bls.go b/crypto/bls.go index c4be5a3aa85..c2e43aee908 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto // BLS signature scheme implementation using BLS12-381 curve @@ -39,7 +36,6 @@ import "C" import ( "bytes" "crypto/sha256" - "errors" "fmt" "golang.org/x/crypto/hkdf" @@ -531,17 +527,6 @@ var signatureLengthBLSBLS12381 = int(C.get_signature_len()) var pubKeyLengthBLSBLS12381 = int(C.get_pk_len()) var prKeyLengthBLSBLS12381 = int(C.get_sk_len()) -// init sets the context of BLS12-381 curve -func (a *blsBLS12381Algo) init() error { - // compare the Go and C layer constants as a sanity check - if signatureLengthBLSBLS12381 != SignatureLenBLSBLS12381 || - pubKeyLengthBLSBLS12381 != PubKeyLenBLSBLS12381 || 
- prKeyLengthBLSBLS12381 != PrKeyLenBLSBLS12381 { - return errors.New("BLS-12381 length settings in Go and C are not consistent, check hardcoded lengths and compressions") - } - return nil -} - // This is only a TEST function. // signWithXMDSHA256 signs a message using XMD_SHA256 as a hash to field. // diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 7055a7efa1b..4181d69fbc7 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1,5 +1,3 @@ -// +build relic - // this file contains utility functions for the curve BLS 12-381 // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 794be4fb705..ff60bede6d5 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto // this file contains utility functions for the curve BLS 12-381 diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 9670d922b15..826872c0e7c 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -1,5 +1,3 @@ -// +build relic - // this file contains utility functions for the curve BLS 12-381 // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index ae1b240d8ae..69d7e687f9b 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto import ( diff --git a/crypto/bls_core.c b/crypto/bls_core.c index e4f1be2dfa4..6711320cf51 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -1,5 +1,3 @@ -// +build relic - #include "bls_include.h" // this file is about the core functions required by the BLS signature scheme diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index ffdb156e251..6d3f1765e25 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto // This file contains tests against the library BLST (https://github.com/supranational/blst). diff --git a/crypto/bls_include.h b/crypto/bls_include.h index 3fc56062ab5..7060ac10bdc 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -1,5 +1,3 @@ -// +build relic - // this file is about the core functions required by the BLS signature scheme #ifndef _REL_BLS_INCLUDE_H diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index 0981103120a..aa8e669924a 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto import ( diff --git a/crypto/bls_no_relic.go b/crypto/bls_no_relic.go deleted file mode 100644 index fed6c216398..00000000000 --- a/crypto/bls_no_relic.go +++ /dev/null @@ -1,156 +0,0 @@ -//go:build !relic -// +build !relic - -package crypto - -import ( - "github.com/onflow/flow-go/crypto/hash" -) - -// The functions below are the non-Relic versions of the public APIs -// requiring the Relic library. -// All BLS functionalities in the package require the Relic dependency, -// and therefore the "relic" build tag. -// Building without the "relic" tag is successful, but and calling one of the -// BLS functions results in a runtime panic. This allows projects depending on the -// crypto library to build successfully with or without the "relic" tag. 
- -const relic_panic = "function is not supported when building without \"relic\" Go build tag" - -const ( - SignatureLenBLSBLS12381 = 48 -) - -// bls.go functions -func NewExpandMsgXOFKMAC128(tag string) hash.Hasher { - panic(relic_panic) -} - -func BLSInvalidSignature() Signature { - panic(relic_panic) -} - -// bls_multisig.go functions -func BLSGeneratePOP(sk PrivateKey) (Signature, error) { - panic(relic_panic) -} - -func BLSVerifyPOP(pk PublicKey, s Signature) (bool, error) { - panic(relic_panic) -} - -func AggregateBLSSignatures(sigs []Signature) (Signature, error) { - panic(relic_panic) -} - -func AggregateBLSPrivateKeys(keys []PrivateKey) (PrivateKey, error) { - panic(relic_panic) -} - -func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { - panic(relic_panic) -} - -func IdentityBLSPublicKey() PublicKey { - panic(relic_panic) -} - -func IsBLSAggregateEmptyListError(err error) bool { - panic(relic_panic) -} - -func IsInvalidSignatureError(err error) bool { - panic(relic_panic) -} - -func IsNotBLSKeyError(err error) bool { - panic(relic_panic) -} - -func IsBLSSignatureIdentity(s Signature) bool { - panic(relic_panic) -} - -func RemoveBLSPublicKeys(aggKey PublicKey, keysToRemove []PublicKey) (PublicKey, error) { - panic(relic_panic) -} - -func VerifyBLSSignatureOneMessage(pks []PublicKey, s Signature, - message []byte, kmac hash.Hasher) (bool, error) { - panic(relic_panic) -} - -func VerifyBLSSignatureManyMessages(pks []PublicKey, s Signature, - messages [][]byte, kmac []hash.Hasher) (bool, error) { - panic(relic_panic) -} - -func BatchVerifyBLSSignaturesOneMessage(pks []PublicKey, sigs []Signature, - message []byte, kmac hash.Hasher) ([]bool, error) { - panic(relic_panic) -} - -func SPOCKProve(sk PrivateKey, data []byte, kmac hash.Hasher) (Signature, error) { - panic(relic_panic) -} - -func SPOCKVerifyAgainstData(pk PublicKey, proof Signature, data []byte, kmac hash.Hasher) (bool, error) { - panic(relic_panic) -} - -func SPOCKVerify(pk1 PublicKey, proof1 Signature, pk2 PublicKey, proof2 Signature) (bool, error) { - panic(relic_panic) -} - -// bls_threshold.go functions -func NewBLSThresholdSignatureParticipant( - groupPublicKey PublicKey, - sharePublicKeys []PublicKey, - threshold int, - myIndex int, - myPrivateKey PrivateKey, - message []byte, - dsTag string, -) (ThresholdSignatureParticipant, error) { - panic(relic_panic) -} - -func NewBLSThresholdSignatureInspector( - groupPublicKey PublicKey, - sharePublicKeys []PublicKey, - threshold int, - message []byte, - dsTag string, -) (ThresholdSignatureInspector, error) { - panic(relic_panic) -} - -func BLSReconstructThresholdSignature(size int, threshold int, - shares []Signature, signers []int) (Signature, error) { - panic(relic_panic) -} - -func EnoughShares(threshold int, sharesNumber int) (bool, error) { - panic(relic_panic) -} - -func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, - []PublicKey, PublicKey, error) { - panic(relic_panic) -} - -// dkg.go functions -func NewFeldmanVSS(size int, threshold int, myIndex int, - processor DKGProcessor, dealerIndex int) (DKGState, error) { - panic(relic_panic) -} - -func NewFeldmanVSSQual(size int, threshold int, myIndex int, - processor DKGProcessor, dealerIndex int) (DKGState, error) { - panic(relic_panic) -} - -func NewJointFeldman(size int, threshold int, myIndex int, - processor DKGProcessor) (DKGState, error) { - panic(relic_panic) -} diff --git a/crypto/bls_no_relic_test.go b/crypto/bls_no_relic_test.go deleted file mode 100644 index 
47f8120060f..00000000000 --- a/crypto/bls_no_relic_test.go +++ /dev/null @@ -1,42 +0,0 @@ -//go:build !relic -// +build !relic - -package crypto - -import ( - "testing" - - "github.com/stretchr/testify/assert" -) - -// Test for all public APIs requiring relic build tag. -// These functions should panic if build without the relic tag. -func TestNoRelicPanic(t *testing.T) { - assert.PanicsWithValue(t, relic_panic, func() { NewExpandMsgXOFKMAC128("") }) - assert.PanicsWithValue(t, relic_panic, func() { BLSInvalidSignature() }) - assert.PanicsWithValue(t, relic_panic, func() { BLSGeneratePOP(nil) }) - assert.PanicsWithValue(t, relic_panic, func() { BLSVerifyPOP(nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { AggregateBLSSignatures(nil) }) - assert.PanicsWithValue(t, relic_panic, func() { AggregateBLSPrivateKeys(nil) }) - assert.PanicsWithValue(t, relic_panic, func() { AggregateBLSPublicKeys(nil) }) - assert.PanicsWithValue(t, relic_panic, func() { IdentityBLSPublicKey() }) - assert.PanicsWithValue(t, relic_panic, func() { IsBLSAggregateEmptyListError(nil) }) - assert.PanicsWithValue(t, relic_panic, func() { IsInvalidSignatureError(nil) }) - assert.PanicsWithValue(t, relic_panic, func() { IsNotBLSKeyError(nil) }) - assert.PanicsWithValue(t, relic_panic, func() { IsBLSSignatureIdentity(nil) }) - assert.PanicsWithValue(t, relic_panic, func() { RemoveBLSPublicKeys(nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { VerifyBLSSignatureOneMessage(nil, nil, nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { VerifyBLSSignatureManyMessages(nil, nil, nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { BatchVerifyBLSSignaturesOneMessage(nil, nil, nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { SPOCKProve(nil, nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { SPOCKVerify(nil, nil, nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { SPOCKVerifyAgainstData(nil, nil, nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { NewBLSThresholdSignatureParticipant(nil, nil, 0, 0, nil, nil, "") }) - assert.PanicsWithValue(t, relic_panic, func() { NewBLSThresholdSignatureInspector(nil, nil, 0, nil, "") }) - assert.PanicsWithValue(t, relic_panic, func() { BLSReconstructThresholdSignature(0, 0, nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { EnoughShares(0, 0) }) - assert.PanicsWithValue(t, relic_panic, func() { BLSThresholdKeyGen(0, 0, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { NewFeldmanVSS(0, 0, 0, nil, 0) }) - assert.PanicsWithValue(t, relic_panic, func() { NewFeldmanVSSQual(0, 0, 0, nil, 0) }) - assert.PanicsWithValue(t, relic_panic, func() { NewJointFeldman(0, 0, 0, nil) }) -} diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 377683addf2..801af0a24a5 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto import ( diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 1d19ca42504..3cef4d4e605 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto // #cgo CFLAGS: diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index 027579d3dae..e160a16e7c9 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -1,5 +1,3 @@ -// +build relic - #include "bls_thresholdsign_include.h" // the highest index of a threshold participant diff --git 
a/crypto/bls_thresholdsign_include.h b/crypto/bls_thresholdsign_include.h index ce88c460f95..1275b10bab4 100644 --- a/crypto/bls_thresholdsign_include.h +++ b/crypto/bls_thresholdsign_include.h @@ -1,5 +1,3 @@ -// +build relic - #ifndef _REL_THRESHOLD_INCLUDE_H #define _REL_THRESHOLD_INCLUDE_H diff --git a/crypto/bls_thresholdsign_test.go b/crypto/bls_thresholdsign_test.go index 9db32e0fe85..3e55f3d1806 100644 --- a/crypto/bls_thresholdsign_test.go +++ b/crypto/bls_thresholdsign_test.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto import ( diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 1f7b2484a3c..e408c9c0c70 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -1,5 +1,3 @@ -// +build relic - #ifndef __BLST_INCLUDE_H__ #define __BLST_INCLUDE_H__ diff --git a/crypto/blst_src/blst_src.c b/crypto/blst_src/blst_src.c index 4b0732e06e4..b904a5d52ee 100644 --- a/crypto/blst_src/blst_src.c +++ b/crypto/blst_src/blst_src.c @@ -1,5 +1,3 @@ -// +build relic - #include "keygen.c" #include "hash_to_field.c" #include "e1.c" diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 89f09e35da0..2b34572089c 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -1,5 +1,3 @@ -// +build relic - #include "dkg_include.h" // computes P(x) = a_0 + a_1*x + .. + a_n x^n in F_r diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index 64f2a11c383..dd81bbcd79c 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto // #cgo CFLAGS: diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index 69393768fe5..620c962faaa 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto // #cgo CFLAGS: diff --git a/crypto/dkg_include.h b/crypto/dkg_include.h index e8489fbf669..ca6619eb10f 100644 --- a/crypto/dkg_include.h +++ b/crypto/dkg_include.h @@ -1,5 +1,3 @@ -// +build relic - #ifndef _REL_DKG_INCLUDE_H #define _REL_DKG_INCLUDE_H diff --git a/crypto/dkg_jointfeldman.go b/crypto/dkg_jointfeldman.go index 8de9695a0c5..c4fb23f578e 100644 --- a/crypto/dkg_jointfeldman.go +++ b/crypto/dkg_jointfeldman.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto // #cgo CFLAGS: diff --git a/crypto/dkg_test.go b/crypto/dkg_test.go index 32a1b9982b4..2bd4dc51fa0 100644 --- a/crypto/dkg_test.go +++ b/crypto/dkg_test.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto import ( diff --git a/crypto/ecdsa_test.go b/crypto/ecdsa_test.go index 342162668cf..d5d38f8e947 100644 --- a/crypto/ecdsa_test.go +++ b/crypto/ecdsa_test.go @@ -1,6 +1,3 @@ -//go:build !relic -// +build !relic - package crypto import ( diff --git a/crypto/sign.go b/crypto/sign.go index 68196acba2d..788a55618d4 100644 --- a/crypto/sign.go +++ b/crypto/sign.go @@ -49,20 +49,22 @@ type signer interface { decodePublicKeyCompressed([]byte) (PublicKey, error) } -// newNonRelicSigner returns a signer that does not depend on Relic library. -func newNonRelicSigner(algo SigningAlgorithm) (signer, error) { +// newSigner returns a signer that does not depend on Relic library. 
+func newSigner(algo SigningAlgorithm) (signer, error) { switch algo { case ECDSAP256: return p256Instance, nil case ECDSASecp256k1: return secp256k1Instance, nil + case BLSBLS12381: + return blsInstance, nil default: return nil, invalidInputsErrorf("the signature scheme %s is not supported", algo) } } // Initialize the context of all algos not requiring Relic -func initNonRelic() { +func init() { // P-256 p256Instance = &(ecdsaAlgo{ curve: elliptic.P256(), @@ -74,19 +76,10 @@ func initNonRelic() { curve: btcec.S256(), algo: ECDSASecp256k1, }) -} -// Signature format Check for non-relic algos (ECDSA) -func signatureFormatCheckNonRelic(algo SigningAlgorithm, s Signature) (bool, error) { - switch algo { - case ECDSAP256: - return p256Instance.signatureFormatCheck(s), nil - case ECDSASecp256k1: - return secp256k1Instance.signatureFormatCheck(s), nil - default: - return false, invalidInputsErrorf( - "the signature scheme %s is not supported", - algo) + // bls12-381 + blsInstance = &blsBLS12381Algo{ + algo: BLSBLS12381, } } @@ -98,8 +91,16 @@ func signatureFormatCheckNonRelic(algo SigningAlgorithm, s Signature) (bool, err // If SignatureFormatCheck returns false then the input is not a valid // signature and will fail a verification against any message and public key. func SignatureFormatCheck(algo SigningAlgorithm, s Signature) (bool, error) { - // For now, signatureFormatCheckNonRelic is only defined for non-Relic algos. - return signatureFormatCheckNonRelic(algo, s) + switch algo { + case ECDSAP256: + return p256Instance.signatureFormatCheck(s), nil + case ECDSASecp256k1: + return secp256k1Instance.signatureFormatCheck(s), nil + default: + return false, invalidInputsErrorf( + "the signature scheme %s is not supported", + algo) + } } // GeneratePrivateKey generates a private key of the algorithm using the entropy of the given seed. diff --git a/crypto/sign_norelic.go b/crypto/sign_norelic.go deleted file mode 100644 index 7e6dd4c0d10..00000000000 --- a/crypto/sign_norelic.go +++ /dev/null @@ -1,13 +0,0 @@ -//go:build !relic -// +build !relic - -package crypto - -// newSigner chooses and initializes a signature scheme -func newSigner(algo SigningAlgorithm) (signer, error) { - return newNonRelicSigner(algo) -} - -func init() { - initNonRelic() -} diff --git a/crypto/sign_relic.go b/crypto/sign_relic.go deleted file mode 100644 index 980fca20c51..00000000000 --- a/crypto/sign_relic.go +++ /dev/null @@ -1,42 +0,0 @@ -//go:build relic -// +build relic - -package crypto - -import ( - "fmt" -) - -// newSigner chooses and initializes a signature scheme -func newSigner(algo SigningAlgorithm) (signer, error) { - // try Relic algos - if signer := relicSigner(algo); signer != nil { - return signer, nil - } - // return a non-Relic algo - return newNonRelicSigner(algo) -} - -// relicSigner returns a signer that depends on Relic library. 
-func relicSigner(algo SigningAlgorithm) signer { - if algo == BLSBLS12381 { - return blsInstance - } - return nil -} - -// Initialize Relic with the BLS context on BLS 12-381 -func init() { - initRelic() - initNonRelic() -} - -// Initialize the context of all algos requiring Relic -func initRelic() { - blsInstance = &blsBLS12381Algo{ - algo: BLSBLS12381, - } - if err := blsInstance.init(); err != nil { - panic(fmt.Sprintf("initialization of BLS failed: %s", err.Error())) - } -} diff --git a/crypto/spock.go b/crypto/spock.go index 4fbd974c27f..46673b0bb13 100644 --- a/crypto/spock.go +++ b/crypto/spock.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto // SPoCK design based on the BLS signature scheme. diff --git a/crypto/spock_test.go b/crypto/spock_test.go index 596968234e4..75de3dea838 100644 --- a/crypto/spock_test.go +++ b/crypto/spock_test.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto import ( From 64112cca8a4c4854ccdbc6d5924214dcfc332d07 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 24 May 2023 14:22:00 -0600 Subject: [PATCH 105/200] remove relic build scripts --- crypto/bls12381_utils.go | 4 +- crypto/build_dependency.sh | 36 --------------- crypto/relic_build.sh | 90 -------------------------------------- 3 files changed, 1 insertion(+), 129 deletions(-) delete mode 100644 crypto/build_dependency.sh delete mode 100755 crypto/relic_build.sh diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index ff60bede6d5..d56be090332 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -4,8 +4,7 @@ package crypto // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -O -D__BLST_PORTABLE__ -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros -// #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s +// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -O -D__BLST_PORTABLE__ -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ // #include "bls12381_utils.h" @@ -37,7 +36,6 @@ import ( ) // Go wrappers around BLST C types -// Go wrappers around Relic C types type pointE1 C.E1 type pointE2 C.E2 type scalar C.Fr diff --git a/crypto/build_dependency.sh b/crypto/build_dependency.sh deleted file mode 100644 index 4bfe99dbad2..00000000000 --- a/crypto/build_dependency.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -PKG_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -RELIC_DIR_NAME="relic" -RELIC_DIR="${PKG_DIR}/${RELIC_DIR_NAME}" - -# grant permissions if not existant -if [[ ! -r ${PKG_DIR} || ! -w ${PKG_DIR} || ! -x ${PKG_DIR} ]]; then - chmod -R 755 "${PKG_DIR}" -fi - -rm -rf "${RELIC_DIR}" - -# relic version or tag -relic_version="7d885d1ba34be61bf22190943a73549a910c1714" - -# clone a specific version of Relic without history if it's tagged. 
-# git -c http.sslVerify=true clone --branch $(relic_version) --single-branch --depth 1 https://github.com/relic-toolkit/relic.git ${RELIC_DIR_NAME} || { echo "git clone failed"; exit 1; } - -# clone all the history if the version is only defined by a commit hash. -git -c http.sslVerify=true clone --branch main --single-branch https://github.com/relic-toolkit/relic.git ${RELIC_DIR_NAME} || { echo "git clone failed"; exit 1; } - -if [ -d "${RELIC_DIR}" ] -then - ( - cd ${RELIC_DIR_NAME} || { echo "cd relic failed"; exit 1; } - git checkout $relic_version - ) - # build relic - bash relic_build.sh -else - { echo "couldn't find relic directory"; exit 1; } -fi - diff --git a/crypto/relic_build.sh b/crypto/relic_build.sh deleted file mode 100755 index 6cff3a6b478..00000000000 --- a/crypto/relic_build.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - -pushd "$DIR" - -# Ensure the directory is writeable -chmod -R +w "$(pwd)" - -mkdir -p "$DIR/relic/build" -pushd "$DIR/relic/build" - - -# make cmake print its CC interpretation -CMAKE_FILE="${DIR}/relic/CMakeLists.txt" -# parameter expansion is not suitable here -# shellcheck disable=SC2089 -CMAKE_PRINT_CC="message ( STATUS \"CC=\$ENV{CC}\" )" -# Make the cmake run print its interpretation of CC -echo "$CMAKE_PRINT_CC" >> "${CMAKE_FILE}" - -# Probe cmake's MakeFile generation and extract the CC version -CMAKE_TEMP=$(mktemp) -cmake .. > "$CMAKE_TEMP" -CC_VAL="$(tail -n 5 "$CMAKE_TEMP" | grep -oE -m 1 'CC=.*$')" -CC_VAL="${CC_VAL:3}" - -# de-mangle the CMakeLists file, using a temporary file for BSD compatibility -sed '$d' ../CMakeLists.txt > "$CMAKE_TEMP" -mv "$CMAKE_TEMP" ../CMakeLists.txt - -# default to which -CC_VAL=${CC_VAL:-"$(which cc)"} -CC_VERSION_STR="$($CC_VAL --version)" - -# we use uname to record which arch we are running on -ARCH=$(uname -m 2>/dev/null || true) - -if [[ "$ARCH" =~ "x86_64" ]]; then - # Compile as westmere arch to avoid cross-compilation issues on machines not supporting AVX extensions. - # Relic performance as used in flow crypto library is not impacted by whether it is compiled with "native" or "westmere", as proven by benchmark results. - MARCH="-march=westmere" -elif [[ "$ARCH" =~ ^(arm64|armv7|armv7s)$ && "${CC_VERSION_STR[0]}" =~ (clang) ]]; then - # the "-march=native" option is not supported with clang on ARM - MARCH="" -else - MARCH="-march=native" -fi - -# Set RELIC config for Flow -COMP=(-DCFLAGS="-O3 -funroll-loops -fomit-frame-pointer ${MARCH} -mtune=native") -GENERAL=(-DTIMER=CYCLE -DCHECK=OFF -DVERBS=OFF) -LIBS=(-DSHLIB=OFF -DSTLIB=ON) -RAND=(-DRAND=HASHD -DSEED=) - -# -BN_REP=(-DALLOC=AUTO -DALIGN=1 -DWSIZE=64 -DBN_PRECI=1024 -DBN_MAGNI=DOUBLE) -ARITH=(-DARITH=EASY) -PRIME=(-DFP_PRIME=381) - -# -BN_METH=(-DBN_KARAT=0 -DBN_METHD="COMBA;COMBA;MONTY;SLIDE;BINAR;BASIC") -FP_METH=(-DFP_KARAT=0 -DFP_METHD="INTEG;INTEG;INTEG;MONTY;MONTY;JMPDS;SLIDE") -PRIMES=(-DFP_PMERS=OFF -DFP_QNRES=ON) -FPX_METH=(-DFPX_METHD="INTEG;INTEG;LAZYR") -EP_METH=(-DEP_MIXED=ON -DEP_PLAIN=OFF -DEP_ENDOM=ON -DEP_SUPER=OFF\ - -DEP_CTMAP=ON -DEP_METHD="JACOB;LWNAF;COMBS;INTER") -PP_METH=(-DPP_METHD="LAZYR;OATEP") - -# run cmake -cmake "${COMP[@]}" "${GENERAL[@]}" \ - "${LIBS[@]}" "${RAND[@]}" \ - "${BN_REP[@]}" "${ARITH[@]}" \ - "${PRIME[@]}" "${PRIMES[@]}" \ - "${EP_METH[@]}" \ - "${BN_METH[@]}" \ - "${FP_METH[@]}" \ - "${FPX_METH[@]}" \ - "${PP_METH[@]}" .. 
- - -# Compile the static library -make clean -make relic_s -j8 -rm -f CMakeCache.txt - -popd -popd From 19a21db959b5fdfb68e30d5350bbc50648247f16 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 24 May 2023 14:59:13 -0600 Subject: [PATCH 106/200] remove relic macros and xmd_sha256, remove relic binray from LD flags --- crypto/bls.go | 2 -- crypto/bls12381_utils.c | 6 +++--- crypto/bls12381_utils.go | 1 + crypto/bls12381_utils.h | 6 +++--- crypto/bls_include.h | 1 - crypto/bls_multisig.go | 3 +-- crypto/dkg_jointfeldman.go | 2 -- crypto/spock.go | 2 -- 8 files changed, 8 insertions(+), 15 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index c2e43aee908..f49f4661772 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -28,8 +28,6 @@ package crypto // - membership checks G2 using Bowe's method (https://eprint.iacr.org/2019/814.pdf) // - implement a G1/G2 swap (signatures on G2 and public keys on G1) -// #cgo CFLAGS: -// #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #include "bls_include.h" import "C" diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 4181d69fbc7..a6b1e5c5e44 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1154,10 +1154,10 @@ void multi_pairing(Fp12* res, const E1 *p, const E2 *q, const int len) { final_exp(res_vec, res_vec); } -// This is a testing function. -// It wraps a call to a Relic macro since cgo can't call macros. +// This is a testing function and is not used in exported functions +// It uses an expand message XMD based on SHA2-256. void xmd_sha256(byte *hash, int len_hash, byte *msg, int len_msg, byte *dst, int len_dst){ - md_xmd_sh256(hash, len_hash, msg, len_msg, dst, len_dst); + expand_message_xmd(hash, len_hash, NULL, 0, msg, len_msg, dst, len_dst); } diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index d56be090332..9695d45aba2 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -12,6 +12,7 @@ package crypto // #if defined(__x86_64__) && (defined(__unix__) || defined(__APPLE__)) // # include // # include +// # include // static void handler(int signum) // { char text[1024] = "Caught SIGILL in blst_cgo_init, BLST library (used by flow-go/crypto) requires ADX support, build with CGO_CFLAGS=-O -D__BLST_PORTABLE__"; // ssize_t n = write(2, &text, strlen(text)); diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 826872c0e7c..d2f2d8b489f 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -5,12 +5,12 @@ #ifndef _REL_MISC_INCLUDE_H #define _REL_MISC_INCLUDE_H -#include "relic.h" +#include #include "blst_include.h" #define SEC_BITS 128 -#define VALID RLC_OK -#define INVALID RLC_ERR +#define VALID 0 +#define INVALID 1 #define UNDEFINED (((VALID&1)^1) | ((INVALID&2)^2)) // different value than RLC_OK and RLC_ERR #define BITS_TO_BYTES(x) ((x+7)>>3) diff --git a/crypto/bls_include.h b/crypto/bls_include.h index 7060ac10bdc..4b8e1075501 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -3,7 +3,6 @@ #ifndef _REL_BLS_INCLUDE_H #define _REL_BLS_INCLUDE_H -#include "relic.h" #include "bls12381_utils.h" // Signature, Public key and Private key lengths diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index aa8e669924a..e451c8d41f5 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -30,8 +30,7 @@ import ( // - batch verification of multiple signatures of a single message under multiple // public keys: use a binary tree of aggregations to find the invalid signatures. 
-// #cgo CFLAGS: -// #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s +// #include "bls12381_utils.h" // #include "bls_include.h" import "C" diff --git a/crypto/dkg_jointfeldman.go b/crypto/dkg_jointfeldman.go index c4fb23f578e..40db316efb5 100644 --- a/crypto/dkg_jointfeldman.go +++ b/crypto/dkg_jointfeldman.go @@ -1,7 +1,5 @@ package crypto -// #cgo CFLAGS: -// #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #include "dkg_include.h" import "C" diff --git a/crypto/spock.go b/crypto/spock.go index 46673b0bb13..8180b9b72bd 100644 --- a/crypto/spock.go +++ b/crypto/spock.go @@ -3,8 +3,6 @@ package crypto // SPoCK design based on the BLS signature scheme. // BLS is using BLS12-381 curve and the same settings in bls.go. -// #cgo CFLAGS: -// #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #include "bls_include.h" import "C" import ( From e58fe245920f8be1fbc7a4759ab3bf3c723062e1 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 24 May 2023 17:33:01 -0600 Subject: [PATCH 107/200] update Makefile and dockerignore --- crypto/.dockerignore | 1 - crypto/Makefile | 34 ++++++---------------------------- 2 files changed, 6 insertions(+), 29 deletions(-) delete mode 100644 crypto/.dockerignore diff --git a/crypto/.dockerignore b/crypto/.dockerignore deleted file mode 100644 index 5c75f82093a..00000000000 --- a/crypto/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -relic/build diff --git a/crypto/Makefile b/crypto/Makefile index d87f27c440f..a75e00df15b 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -12,41 +12,19 @@ endif ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) -.PHONY: setup -setup: - go generate - -# test BLS-related functionalities requiring the Relic library (and hence relic Go build flag) -.PHONY: relic_tests -relic_tests: +# test all packages +.PHONY: test +test: +# root package (it uses BLST source files underneath which requires testing for ADX support) ifeq ($(ADX_SUPPORT), 1) - go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) --tags relic $(if $(VERBOSE),-v,) + go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) else - CGO_CFLAGS="-O -D__BLST_PORTABLE__" go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) --tags relic $(if $(VERBOSE),-v,) + CGO_CFLAGS="-O -D__BLST_PORTABLE__" go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) endif - -# test all packages that do not require Relic library (all functionalities except the BLS-related ones) -.PHONY: non_relic_tests -non_relic_tests: -# root package without relic - go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) # sub packages go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) ./hash go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) ./random -############################################################################################ -# CAUTION: DO NOT MODIFY THIS TARGET! 
DOING SO WILL BREAK THE FLAKY TEST MONITOR - -# sets up the crypto module and runs all tests -.PHONY: test -test: setup unittest - -# runs the unit tests of the module (assumes the module was set up) -.PHONY: unittest -unittest: relic_tests non_relic_tests - -############################################################################################ - .PHONY: docker-build docker-build: docker build -t gcr.io/dl-flow/golang-cmake:latest -t gcr.io/dl-flow/golang-cmake:$(IMAGE_TAG) . From 81239706a01f8afa320f251dc3153195f86e5a7a Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 24 May 2023 18:17:59 -0600 Subject: [PATCH 108/200] remove Relic mentions in code and README --- crypto/README.md | 91 +++++------------------------------------- crypto/bls.go | 24 +++++------ crypto/bls_multisig.go | 13 +++--- crypto/sign.go | 4 +- 4 files changed, 31 insertions(+), 101 deletions(-) diff --git a/crypto/README.md b/crypto/README.md index 9f29ad03e16..97156fa52c9 100644 --- a/crypto/README.md +++ b/crypto/README.md @@ -6,86 +6,22 @@ Most of the primitives and protocols can be used in other projects and are not s Flow is an ongoing project, which means that new features will still be added and modifications will still be made to improve security and performance of the cryptography package. Notes: - - The package has been audited for security in January 2021 on [this version](https://github.com/onflow/flow-go/tree/2707acdabb851138e298b2d186e73f47df8a14dd). The package had a few improvements since. + - The package has been audited for security in January 2021 on [this version](https://github.com/onflow/flow-go/tree/2707acdabb851138e298b2d186e73f47df8a14dd). The package had a major refactor to switch all the BLS12-381 curve implementation to use [BLST](https://github.com/supranational/blst/tree/master/src) starting from [this version](TODO: link the commit/tag). - The package does not provide security against side channel or fault attacks. ## Package import -Cloning Flow repository and following the [installation steps](https://github.com/onflow/flow-go) builds the necessary tools to use Flow cryptography. +To use the Flow cryptography package, you can: -If you wish to only import the Flow cryptography package into your Go project, please follow the following steps: - -- Get Flow cryptography package +- get the package ``` go get github.com/onflow/flow-go/crypto ``` -or simply import the package to your Go project +- or simply import the package to your Go project ``` import "github.com/onflow/flow-go/crypto" ``` -This is enough to run the package code for many functionalities. However, this isn't enough if BLS signature related functionalities are used. The BLS features rely on an extrnal C library ([Relic](https://github.com/relic-toolkit/relic)) for lower level mathematical operations. Building your project at this stage including BLS functionalities would result in build errors related to missing "relic" files. For instance: -``` -fatal error: 'relic.h' file not found -#include "relic.h" - ^~~~~~~~~ -``` - - An extra step is required to compile the external dependency (Relic) locally. - -- Install [CMake](https://cmake.org/install/), which is used for building the package. The build also requires [Git](http://git-scm.com/) and bash scripting. -- From the Go package directory in `$GOPATH/pkg/mod/github.com/onflow/flow-go/crypto@/`, build the package dependencies. `version-tag` is the imported package version. 
-For instance: -``` -cd $GOPATH/pkg/mod/github.com/onflow/flow-go/crypto@v0.25.0/ -go generate -``` - -Below is a bash script example to automate the above steps. The script can be copied into your Go project root directory. -It extracts the imported pacakage version from your project's go.mod file and performs the remaining steps. -```bash -#!/bin/bash - -# crypto package -PKG_NAME="github.com/onflow/flow-go/crypto" - -# go get the package -go get ${PKG_NAME} - -# go.mod -MOD_FILE="./go.mod" - -# the version of onflow/flow-go/crypto used in the project is read from the go.mod file -if [ -f "${MOD_FILE}" ] -then - # extract the version from the go.mod file - VERSION="$(grep ${PKG_NAME} < ${MOD_FILE} | cut -d' ' -f 2)" - # using the right version, get the package directory path - PKG_DIR="$(go env GOPATH)/pkg/mod/${PKG_NAME}@${VERSION}" -else - { echo "couldn't find go.mod file - make sure the script is in the project root directory"; exit 1; } -fi - -# grant permissions if not existant -if [[ ! -r ${PKG_DIR} || ! -w ${PKG_DIR} || ! -x ${PKG_DIR} ]]; then - sudo chmod -R 755 "${PKG_DIR}" -fi - -# get into the package directory and set up the external dependencies -( - cd "${PKG_DIR}" || { echo "cd into the GOPATH package folder failed"; exit 1; } - go generate -) -``` - - -Finally, when building your project and including any BLS functionality, adding a Go build tag to include the BLS files in the build is required. -The tag is not required when the package is used without BLS functions. It was introduced to avoid build errors when BLS (and therefore Relic) is not needed. - -``` -go build -tags=relic -``` - ## Algorithms ### Hashing and Message Authentication Code: @@ -103,11 +39,11 @@ All signature schemes use the generic interfaces of `PrivateKey` and `PublicKey` * ECDSA * public keys are compressed or uncompressed. - * ephemeral key is derived from the private key, hash and an external entropy using a CSPRNG (based on https://golang.org/pkg/crypto/ecdsa/). + * ephemeral key is derived from the private key, hash and the system entropy (based on https://golang.org/pkg/crypto/ecdsa/). * supports NIST P-256 (secp256r1) and secp256k1 curves. * BLS - * supports [BLS 12-381](https://electriccoin.co/blog/new-snark-curve/) curve. + * supports [BLS12-381](https://electriccoin.co/blog/new-snark-curve/) curve. * is implementing the minimal-signature-size variant: signatures in G1 and public keys in G2. * default set-up uses [compressed](https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) G1/G2 points, @@ -119,16 +55,14 @@ All signature schemes use the generic interfaces of `PrivateKey` and `PublicKey` and BLS_POP_BLS12381G1_XOF:KMAC128_SSWU_RO_POP_ for proofs of possession. * signature verification includes the signature membership check in G1. * public key membership check in G2 is provided outside of the signature verification. - * membership check in G1 is using [Bowe's fast check](https://eprint.iacr.org/2019/814.pdf), while membership check in G2 is using a simple scalar multiplication by the group order (both will be updated to use Scott's method) - * non-interactive aggregation of signatures, public keys and private keys. - * multi-signature verification of an aggregated signature of a single message under multiple public keys. - * multi-signature verification of an aggregated signature of multiple messages under multiple public keys. + * aggregation of signatures, public keys and private keys. 
+ * verification of an aggregated signature of a single message under multiple public keys. + * verification of an aggregated signature of multiple messages under multiple public keys. * batch verification of multiple signatures of a single message under multiple - public keys: use a binary tree of aggregations to find the invalid signatures. + public keys, using a binary tree of aggregations. * SPoCK scheme based on BLS: verifies two signatures have been generated from the same message that is unknown to the verifier. * Future features: - * membership checks in G1/G2 using [Scotts's method](https://eprint.iacr.org/2021/1130.pdf). * support minimal-pubkey-size variant ### PRNG @@ -146,9 +80,6 @@ All signature schemes use the generic interfaces of `PrivateKey` and `PublicKey` * key generation (single dealer) to provide the set of keys. * provides a stateless api and a stateful api. - * Future features: - * support a partial signature reconstruction in the stateful api to avoid a long final reconstruction. - ### Discrete-Log based distributed key generation @@ -158,7 +89,7 @@ All supported Distributed Key Generation protocols are [discrete log based](http * simple verifiable secret sharing with a single dealer. * the library does not implement the communication channels between participants. The caller should implement the methods `PrivateSend` (1-to-1 messaging) and `Broadcast` (1-to-n messaging) * 1-to-1 messaging must be a private channel, the caller must make sure the channel preserves confidentialiy and authenticates the sender. - * 1-to-n broadcasting assume all destination participants receive the same copy of the message. The channel should also authenticate the broadcaster. + * 1-to-n broadcasting is a reliable broadcast, where honest senders are able to reach all honest receivers, and where all honest receivers end up with the same received messages. The channel should also authenticate the broadcaster. * It is recommended that both communication channels are unique per protocol instance. This could be achieved by prepending the messages to send/broadcast by a unique protocol instance ID. * Feldman VSS Qual. * an extension of the simple Feldman VSS. diff --git a/crypto/bls.go b/crypto/bls.go index f49f4661772..c8650c9dc60 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -1,12 +1,13 @@ package crypto -// BLS signature scheme implementation using BLS12-381 curve -// ([zcash]https://electriccoin.co/blog/new-snark-curve/) -// Pairing, ellipic curve and modular arithmetic is using Relic library. -// This implementation does not include any security against side-channel attacks. - -// existing features: -// - the implementation variant is minimal-signature-size signatures: +// BLS signature scheme implementation using the BLS12-381 curve +// ([zcash]https://electriccoin.co/blog/new-snark-curve/). +// Pairing, ellipic curve and modular arithmetic are using [BLST](https://github.com/supranational/blst/tree/master/src) +// tools underneath. +// This implementation does not include security against side-channel or fault attacks. + +// Existing features: +// - the implementation variant is minimal-signature-size: // shorter signatures in G1, longer public keys in G2 // - serialization of points on G1 and G2 is compressed ([zcash] // https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) @@ -18,15 +19,12 @@ package crypto // and BLS_POP_BLS12381G1_XOF:KMAC128_SSWU_RO_POP_ for proofs of possession. 
// - signature verification checks the membership of signature in G1. // - the public key membership check in G2 is implemented separately from the signature verification. -// - membership check in G1 is implemented using fast Bowe's check (to be updated to Scott's check). -// - membership check in G2 is using a simple scalar multiplication with the group order (to be updated to Scott's check). // - multi-signature tools are defined in bls_multisg.go -// - SPoCK scheme based on BLS: verifies two signatures have been generated from the same message, -// that is unknown to the verifier. +// - SPoCK scheme based on BLS: verifies two signatures are generated from the same message, +// even though the message is unknown to the verifier. // future features: -// - membership checks G2 using Bowe's method (https://eprint.iacr.org/2019/814.pdf) -// - implement a G1/G2 swap (signatures on G2 and public keys on G1) +// - implement a G1/G2 swap (minimal-pubkey-size variant) // #include "bls_include.h" import "C" diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index e451c8d41f5..5714b7e2a34 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -15,20 +15,21 @@ import ( // BLS multi-signature using BLS12-381 curve // ([zcash]https://github.com/zkcrypto/pairing/blob/master/src/bls12_381/README.md#bls12-381) -// Pairing, ellipic curve and modular arithmetic is using Relic library. -// This implementation does not include any security against side-channel attacks. +// Pairing, elliptic curve and modular arithmetic are using [BLST](https://github.com/supranational/blst/tree/master/src) +// tools underneath. +// This implementation does not include any security against side-channel or fault attacks. -// existing features: +// Existing features: // - the same BLS set-up in bls.go // - Use the proof of possession scheme (PoP) to prevent against rogue public-key attack. -// - Non-interactive aggregation of private keys, public keys and signatures. -// - Non-interactive subtraction of multiple public keys from an (aggregated) public key. +// - Aggregation of private keys, public keys and signatures. +// - Subtraction of multiple public keys from an (aggregated) public key. // - Multi-signature verification of an aggregated signature of a single message // under multiple public keys. // - Multi-signature verification of an aggregated signature of multiple messages under // multiple public keys. // - batch verification of multiple signatures of a single message under multiple -// public keys: use a binary tree of aggregations to find the invalid signatures.
+// newSigner returns a signer instance func newSigner(algo SigningAlgorithm) (signer, error) { switch algo { case ECDSAP256: @@ -63,7 +63,7 @@ func newSigner(algo SigningAlgorithm) (signer, error) { } } -// Initialize the context of all algos not requiring Relic +// Initialize the context of all algos func init() { // P-256 p256Instance = &(ecdsaAlgo{ From 541df79e5aa0f017cf0e2a55cee38a938eacc66f Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 24 May 2023 18:23:23 -0600 Subject: [PATCH 109/200] update flow-go/README and gitignore --- .gitignore | 2 -- README.md | 7 ------- 2 files changed, 9 deletions(-) diff --git a/.gitignore b/.gitignore index 472cc944ee4..7d437f2c93e 100644 --- a/.gitignore +++ b/.gitignore @@ -7,8 +7,6 @@ /cmd/util/util /cmd/bootstrap/bootstrap -# crypto relic folder -crypto/relic/ # Test binary, build with `go test -c` *.test diff --git a/README.md b/README.md index 39bd7a13e3e..291e45de347 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,6 @@ The following table lists all work streams and links to their home directory and - Clone this repository - Install [Go](https://golang.org/doc/install) (Flow supports Go 1.18 and later) -- Install [CMake](https://cmake.org/install/), which is used for building the crypto library - Install [Docker](https://docs.docker.com/get-docker/), which is used for running a local network and integration tests - Make sure the [`GOPATH`](https://golang.org/cmd/go/#hdr-GOPATH_environment_variable) and `GOBIN` environment variables are set, and `GOBIN` is added to your path: @@ -75,12 +74,6 @@ The following table lists all work streams and links to their home directory and At this point, you should be ready to build, test, and run Flow! 🎉 -Note: Whenever the crypto module version imported by "go.mod" is updated to a version that was never locally imported before, the crypto dependency needs to be set-up. If not, you should notice errors about "relic" or "crypto". Run the following command to set-up the new module version: - -```bash -make crypto_setup_gopath -``` - ## Development Workflow ### Testing From 1d0b8f906de5158f6358cd75543ef582953c9e4d Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 24 May 2023 18:43:06 -0600 Subject: [PATCH 110/200] remove relic related commands from Makefile/ci/dockerfile --- .github/workflows/bench.yml | 6 +-- .github/workflows/ci.yml | 37 ++++------------ .github/workflows/flaky-test-debug.yml | 14 ------- Makefile | 42 +++++++------------ cmd/bootstrap/README.md | 8 ++-- cmd/bootstrap/cmd/genconfig.go | 2 +- insecure/Makefile | 2 +- integration/Makefile | 24 +++++------ integration/benchmark/cmd/manual/Dockerfile | 16 +------ integration/benchnet2/Makefile | 8 ++-- integration/localnet/Makefile | 8 ++-- module/metrics/example/README.md | 2 +- .../level1/process_summary1_results_test.go | 12 +++--- utils/binstat/binstat_external_test.go | 2 +- 14 files changed, 61 insertions(+), 122 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index ada29474be7..7c3c6d896bd 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -49,14 +49,14 @@ jobs: - name: Run benchmark on current branch run: | - (for i in {1..${{ steps.settings.outputs.benchmark_repetitions }}}; do go test ./fvm ./engine/execution/computation --bench . --tags relic -shuffle=on --benchmem --run ^$; done) | tee new.txt + (for i in {1..${{ steps.settings.outputs.benchmark_repetitions }}}; do go test ./fvm ./engine/execution/computation --bench . 
-shuffle=on --benchmem --run ^$; done) | tee new.txt - name: Checkout base branch run: git checkout ${{ github.event.pull_request.base.sha }} - name: Run benchmark on base branch run: | - (for i in {1..${{ steps.settings.outputs.benchmark_repetitions }}}; do go test ./fvm ./engine/execution/computation --bench . --tags relic -shuffle=on --benchmem --run ^$; done) | tee old.txt + (for i in {1..${{ steps.settings.outputs.benchmark_repetitions }}}; do go test ./fvm ./engine/execution/computation --bench . -shuffle=on --benchmem --run ^$; done) | tee old.txt # see https://trstringer.com/github-actions-multiline-strings/ to see why this part is complex - name: Use benchstat for comparison @@ -85,7 +85,7 @@ jobs: This branch with compared with the base branch ${{ github.event.pull_request.base.label }} commit ${{ github.event.pull_request.base.sha }} - The command `(for i in {1..${{ steps.settings.outputs.benchmark_repetitions }}}; do go test ./fvm ./engine/execution/computation --bench . --tags relic -shuffle=on --benchmem --run ^$; done)` was used. + The command `(for i in {1..${{ steps.settings.outputs.benchmark_repetitions }}}; do go test ./fvm ./engine/execution/computation --bench . -shuffle=on --benchmem --run ^$; done)` was used.
Collapsed results for better readability diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 08832eab401..57b0da2ace2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,8 +38,6 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Build relic - run: make crypto_setup_gopath - name: Run go generate run: go generate working-directory: ${{ matrix.dir }} @@ -48,7 +46,7 @@ jobs: with: # Required: the version of golangci-lint is required and must be specified without patch version: we always use the latest patch version. version: v1.49 - args: -v --build-tags relic + args: -v working-directory: ${{ matrix.dir }} # https://github.com/golangci/golangci-lint-action/issues/244 skip-cache: true @@ -66,20 +64,6 @@ jobs: cache: true - name: Run tidy run: make tidy - - name: Emulator no relic check - run: make emulator-norelic-check - - shell-check: - name: ShellCheck - runs-on: ubuntu-latest - steps: - - name: Checkout repo - uses: actions/checkout@v3 - - name: Run ShellCheck - uses: ludeeus/action-shellcheck@203a3fd018dfe73f8ae7e3aa8da2c149a5f41c33 - with: - scandir: './crypto' - ignore: 'relic' create-dynamic-test-matrix: name: Create Dynamic Test Matrix @@ -141,18 +125,15 @@ jobs: matrix: include: - name: crypto - make1: -C crypto setup - make2: unittest + setup: retries: 1 race: 1 - name: insecure - make1: install-tools - make2: test + setup: install-tools retries: 3 race: 1 - name: integration - make1: install-tools - make2: test + setup: install-tools retries: 3 race: 0 runs-on: ubuntu-latest @@ -165,7 +146,7 @@ jobs: go-version: ${{ env.GO_VERSION }} cache: true - name: Setup tests (${{ matrix.name }}) - run: make ${{ matrix.make1 }} + run: make ${{ matrix.setup }} - name: Run tests (${{ matrix.name }}) env: RACE_DETECTOR: ${{ matrix.race }} @@ -173,8 +154,8 @@ jobs: with: timeout_minutes: 25 max_attempts: ${{ matrix.retries }} - # run `make2` target inside each module's root - command: VERBOSE=1 make -C ${{ matrix.name }} ${{ matrix.make2 }} + # run test target inside each module's root + command: VERBOSE=1 make -C ${{ matrix.name }} test - name: Upload coverage report uses: codecov/codecov-action@v3 with: @@ -208,8 +189,6 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Build relic - run: make crypto_setup_gopath - name: Docker build run: make docker-build-flow docker-build-flow-corrupt - name: Run tests @@ -235,7 +214,7 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Build relic and other tools + - name: install tools run: make install-tools - name: Install Flow Client In Docker # This proved to be more reliable than installing it locally. 
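For context on what the unconditional crypto test target above now covers: the BLS APIs are compiled into every build, with BLST doing the curve arithmetic underneath. A rough sketch of the sign/verify round trip those tests exercise; the names GeneratePrivateKey, BLSBLS12381 and NewExpandMsgXOFKMAC128 appear in the diffs above, but the exact signatures and seed-length rules shown here are assumptions:

```go
package main

import (
	"crypto/rand"
	"fmt"

	"github.com/onflow/flow-go/crypto"
)

func main() {
	// BLS key generation derives the key from a caller-provided seed;
	// the package enforces minimum seed length, 64 bytes is assumed to be enough here.
	seed := make([]byte, 64)
	if _, err := rand.Read(seed); err != nil {
		panic(err)
	}

	sk, err := crypto.GeneratePrivateKey(crypto.BLSBLS12381, seed)
	if err != nil {
		panic(err)
	}

	// Hasher matching the package's BLS ciphersuite (expand-message XOF with KMAC128).
	kmac := crypto.NewExpandMsgXOFKMAC128("test tag")

	msg := []byte("message")
	sig, err := sk.Sign(msg, kmac)
	if err != nil {
		panic(err)
	}

	ok, err := sk.PublicKey().Verify(sig, msg, kmac)
	fmt.Println(ok, err)
}
```

On CPUs without ADX, the same code is expected to work when built with CGO_CFLAGS="-O -D__BLST_PORTABLE__", which is the fallback the Makefile targets and the CI matrix above use.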
diff --git a/.github/workflows/flaky-test-debug.yml b/.github/workflows/flaky-test-debug.yml index 3a5b47e2c2f..3e5092c9f07 100644 --- a/.github/workflows/flaky-test-debug.yml +++ b/.github/workflows/flaky-test-debug.yml @@ -55,20 +55,6 @@ jobs: cache: true - name: Run tidy run: make tidy - - name: Emulator no relic check - run: make emulator-norelic-check - - # shell-check: - # name: ShellCheck - # runs-on: ubuntu-latest - # steps: - # - name: Checkout repo - # uses: actions/checkout@v3 - # - name: Run ShellCheck - # uses: ludeeus/action-shellcheck@203a3fd018dfe73f8ae7e3aa8da2c149a5f41c33 - # with: - # scandir: './crypto' - # ignore: 'relic' create-dynamic-test-matrix: name: Create Dynamic Test Matrix diff --git a/Makefile b/Makefile index d0a8fd10c23..6d9b2321bab 100644 --- a/Makefile +++ b/Makefile @@ -42,19 +42,11 @@ K8S_YAMLS_LOCATION_STAGING=./k8s/staging export CONTAINER_REGISTRY := gcr.io/flow-container-registry export DOCKER_BUILDKIT := 1 -# setup the crypto package under the GOPATH: needed to test packages importing flow-go/crypto -# TODO: replace by bash crypto_setup.sh after removing replace statements -.PHONY: crypto_setup_gopath -crypto_setup_gopath: - (cd ./crypto && make setup) - - - cmd/collection/collection: go build -o cmd/collection/collection cmd/collection/main.go cmd/util/util: - go build -o cmd/util/util --tags relic cmd/util/main.go + go build -o cmd/util/util cmd/util/main.go .PHONY: update-core-contracts-version update-core-contracts-version: @@ -65,8 +57,8 @@ update-core-contracts-version: .PHONY: unittest-main unittest-main: - # test all packages with Relic library enabled - go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) -covermode=atomic $(if $(RACE_DETECTOR),-race,) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) --tags relic $(GO_TEST_PACKAGES) + # test all packages + go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) -covermode=atomic $(if $(RACE_DETECTOR),-race,) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(GO_TEST_PACKAGES) .PHONY: install-mock-generators install-mock-generators: @@ -88,15 +80,10 @@ verify-mocks: generate-mocks ############################################################################################ -.PHONY: emulator-norelic-check -emulator-norelic-check: - # test the fvm package compiles with Relic library disabled (required for the emulator build) - cd ./fvm && go test ./... 
-run=NoTestHasThisPrefix - .PHONY: fuzz-fvm fuzz-fvm: # run fuzz tests in the fvm package - cd ./fvm && go test -fuzz=Fuzz -run ^$$ --tags relic + cd ./fvm && go test -fuzz=Fuzz -run ^$$ .PHONY: test test: verify-mocks unittest-main @@ -154,7 +141,7 @@ generate-mocks: install-mock-generators mockery --name 'ProviderEngine' --dir=engine/execution/provider --case=underscore --output="engine/execution/provider/mock" --outpkg="mock" (cd ./crypto && mockery --name 'PublicKey' --case=underscore --output="../module/mock" --outpkg="mock") mockery --name '.*' --dir=state/cluster --case=underscore --output="state/cluster/mock" --outpkg="mock" - mockery --name '.*' --dir=module --case=underscore --tags="relic" --output="./module/mock" --outpkg="mock" + mockery --name '.*' --dir=module --case=underscore --output="./module/mock" --outpkg="mock" mockery --name '.*' --dir=module/mempool --case=underscore --output="./module/mempool/mock" --outpkg="mempool" mockery --name '.*' --dir=module/component --case=underscore --output="./module/component/mock" --outpkg="component" mockery --name '.*' --dir=network --case=underscore --output="./network/mocknetwork" --outpkg="mocknetwork" @@ -182,7 +169,7 @@ generate-mocks: install-mock-generators mockery --name 'API' --dir="./engine/protocol" --case=underscore --output="./engine/protocol/mock" --outpkg="mock" mockery --name 'API' --dir="./engine/access/state_stream" --case=underscore --output="./engine/access/state_stream/mock" --outpkg="mock" mockery --name 'ConnectionFactory' --dir="./engine/access/rpc/backend" --case=underscore --output="./engine/access/rpc/backend/mock" --outpkg="mock" - mockery --name 'IngestRPC' --dir="./engine/execution/ingestion" --case=underscore --tags relic --output="./engine/execution/ingestion/mock" --outpkg="mock" + mockery --name 'IngestRPC' --dir="./engine/execution/ingestion" --case=underscore --output="./engine/execution/ingestion/mock" --outpkg="mock" mockery --name '.*' --dir=model/fingerprint --case=underscore --output="./model/fingerprint/mock" --outpkg="mock" mockery --name 'ExecForkActor' --structname 'ExecForkActorMock' --dir=module/mempool/consensus/mock/ --case=underscore --output="./module/mempool/consensus/mock/" --outpkg="mock" mockery --name '.*' --dir=engine/verification/fetcher/ --case=underscore --output="./engine/verification/fetcher/mock" --outpkg="mockfetcher" @@ -207,12 +194,12 @@ tidy: .PHONY: lint lint: tidy # revive -config revive.toml -exclude storage/ledger/trie ./... - golangci-lint run -v --build-tags relic ./... + golangci-lint run -v ./... .PHONY: fix-lint fix-lint: # revive -config revive.toml -exclude storage/ledger/trie ./... - golangci-lint run -v --build-tags relic --fix ./... + golangci-lint run -v --fix ./... 
# Runs unit tests with different list of packages as passed by CI so they run in parallel .PHONY: ci @@ -242,7 +229,6 @@ docker-ci: # Runs integration tests in Docker (for mac) .PHONY: docker-ci-integration docker-ci-integration: - rm -rf crypto/relic docker run \ --env DOCKER_API_VERSION='1.39' \ --network host \ @@ -262,7 +248,7 @@ docker-build-collection: .PHONY: docker-build-collection-without-netgo docker-build-collection-without-netgo: - docker build -f cmd/Dockerfile --build-arg TAGS=relic --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/collection:$(IMAGE_TAG_NO_NETGO)" . @@ -281,7 +267,7 @@ docker-build-consensus: .PHONY: docker-build-consensus-without-netgo docker-build-consensus-without-netgo: - docker build -f cmd/Dockerfile --build-arg TAGS=relic --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/consensus:$(IMAGE_TAG_NO_NETGO)" . @@ -300,7 +286,7 @@ docker-build-execution: .PHONY: docker-build-execution-without-netgo docker-build-execution-without-netgo: - docker build -f cmd/Dockerfile --build-arg TAGS=relic --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/execution:$(IMAGE_TAG_NO_NETGO)" . @@ -329,7 +315,7 @@ docker-build-verification: .PHONY: docker-build-verification-without-netgo docker-build-verification-without-netgo: - docker build -f cmd/Dockerfile --build-arg TAGS=relic --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/verification:$(IMAGE_TAG_NO_NETGO)" . 
@@ -358,7 +344,7 @@ docker-build-access: .PHONY: docker-build-access-without-netgo docker-build-access-without-netgo: - docker build -f cmd/Dockerfile --build-arg TAGS=relic --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/access:$(IMAGE_TAG_NO_NETGO)" . @@ -387,7 +373,7 @@ docker-build-observer: .PHONY: docker-build-observer-without-netgo docker-build-observer-without-netgo: - docker build -f cmd/Dockerfile --build-arg TAGS=relic --build-arg TARGET=./cmd/observer --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/observer --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/observer:$(IMAGE_TAG_NO_NETGO)" . diff --git a/cmd/bootstrap/README.md b/cmd/bootstrap/README.md index 9000f4d87f4..5fd2964faf5 100644 --- a/cmd/bootstrap/README.md +++ b/cmd/bootstrap/README.md @@ -46,7 +46,7 @@ _Each cluster_ of collector nodes needs to have its own root Block and root QC # Usage -`go run -tags relic ./cmd/bootstrap` prints usage information +`go run ./cmd/bootstrap` prints usage information ## Phase 1: Generate networking and staking keys for partner nodes: @@ -65,7 +65,7 @@ If seeds are not provided, the CLI will try to use the system's pseudo-random nu #### Example ```bash -go run -tags relic ./cmd/bootstrap key --address "example.com:1234" --role "consensus" -o ./bootstrap/partner-node-infos +go run ./cmd/bootstrap key --address "example.com:1234" --role "consensus" -o ./bootstrap/partner-node-infos ``` #### Generated output files @@ -97,7 +97,7 @@ Each input is a config file specified as a command line parameter: #### Example ```bash -go run -tags relic ./cmd/bootstrap finalize \ +go run ./cmd/bootstrap finalize \ --fast-kg \ --root-chain main \ --root-height 0 \ @@ -153,7 +153,7 @@ go run -tags relic ./cmd/bootstrap finalize \ This generates the networking key used by observers to connect to the public libp2p network. It is a different key format than staked nodes and should only be used for Observers. ```bash -go run -tags relic ./cmd/bootstrap observer-network-key -f ./path/network-key +go run ./cmd/bootstrap observer-network-key -f ./path/network-key ``` This key must be kept secret as it's used to encrypt and sign network requests sent by the observers. 
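The bootstrap commands above wrap key generation from this same crypto package; which curve each node role uses is the CLI's decision. A hedged sketch of the ECDSA side only, where the SHA3-256 hasher from the hash sub-package and the seed length are assumptions chosen for illustration:

```go
package main

import (
	"crypto/rand"
	"fmt"

	"github.com/onflow/flow-go/crypto"
	"github.com/onflow/flow-go/crypto/hash"
)

func main() {
	// ECDSA keys are also derived deterministically from a seed.
	seed := make([]byte, 64)
	if _, err := rand.Read(seed); err != nil {
		panic(err)
	}

	sk, err := crypto.GeneratePrivateKey(crypto.ECDSAP256, seed)
	if err != nil {
		panic(err)
	}

	// Sign and verify a payload with a SHA3-256 hasher from the hash sub-package.
	h := hash.NewSHA3_256()
	msg := []byte("handshake payload")
	sig, err := sk.Sign(msg, h)
	if err != nil {
		panic(err)
	}
	ok, err := sk.PublicKey().Verify(sig, msg, h)
	fmt.Println(ok, err)
}
```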
diff --git a/cmd/bootstrap/cmd/genconfig.go b/cmd/bootstrap/cmd/genconfig.go index 404bd5e873e..ccf66104ecc 100644 --- a/cmd/bootstrap/cmd/genconfig.go +++ b/cmd/bootstrap/cmd/genconfig.go @@ -63,7 +63,7 @@ func genconfigCmdRun(_ *cobra.Command, _ []string) { var genconfigCmd = &cobra.Command{ Use: "genconfig", Short: "Generate node-config.json", - Long: "example: go run -tags relic ./cmd/bootstrap genconfig --address-format \"%s-%03d.devnet19.nodes.onflow.org:3569\" --access 2 --collection 3 --consensus 3 --execution 2 --verification 1 --weight 100", + Long: "example: go run ./cmd/bootstrap genconfig --address-format \"%s-%03d.devnet19.nodes.onflow.org:3569\" --access 2 --collection 3 --consensus 3 --execution 2 --verification 1 --weight 100", Run: genconfigCmdRun, } diff --git a/insecure/Makefile b/insecure/Makefile index 72a38cf4b4d..9872f01b1d8 100644 --- a/insecure/Makefile +++ b/insecure/Makefile @@ -11,4 +11,4 @@ endif # runs all unit tests of the insecure module .PHONY: test test: - go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) --tags relic ./... + go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./... diff --git a/integration/Makefile b/integration/Makefile index a4f354c7e4d..7751b4ee333 100644 --- a/integration/Makefile +++ b/integration/Makefile @@ -22,53 +22,53 @@ ci-integration-test: access-tests ghost-tests mvp-tests epochs-tests consensus-t # Run unit tests for test utilities in this module .PHONY: test test: - go test $(if $(VERBOSE),-v,) -tags relic -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) `go list ./... | grep -v -e integration/tests` + go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) `go list ./... | grep -v -e integration/tests` .PHONY: access-tests access-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/access/... + go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/access/... .PHONY: collection-tests collection-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/collection/... + go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/collection/... .PHONY: consensus-tests consensus-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/consensus/... + go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/consensus/... .PHONY: epochs-tests epochs-tests: # Use a higher timeout of 20m for the suite of tests which span full epochs - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic -timeout 30m ./tests/epochs/... + go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -timeout 30m ./tests/epochs/... .PHONY: ghost-tests ghost-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/ghost/... 
+ go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/ghost/... .PHONY: mvp-tests mvp-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/mvp/... + go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/mvp/... .PHONY: execution-tests execution-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/execution/... + go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/execution/... .PHONY: verification-tests verification-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/verification/... + go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/verification/... .PHONY: upgrades-tests upgrades-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/upgrades/... + go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/upgrades/... .PHONY: network-tests network-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/network/... + go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/network/... # BFT tests need to be run sequentially (-p 1) due to interference between different Docker networks when tests are run in parallel .PHONY: bft-tests bft-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/bft/... -p 1 + go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/bft/... -p 1 ############################################################################################ diff --git a/integration/benchmark/cmd/manual/Dockerfile b/integration/benchmark/cmd/manual/Dockerfile index 1ad38985a43..8d474efd3dc 100644 --- a/integration/benchmark/cmd/manual/Dockerfile +++ b/integration/benchmark/cmd/manual/Dockerfile @@ -4,20 +4,11 @@ FROM golang:1.19-buster AS build-setup RUN apt-get update -RUN apt-get -y install cmake zip - -## (1) Build Relic first to maximize caching -FROM build-setup AS build-relic +RUN apt-get -y install zip RUN mkdir /build WORKDIR /build -# Copy over the crypto package -COPY crypto ./crypto - -# Build Relic (this places build artifacts in /build/relic/build) -RUN cd ./crypto/ && go generate - ## (2) Build the app binary FROM build-setup AS build-env @@ -35,9 +26,6 @@ ARG TARGET COPY . . 
-# Copy over Relic build artifacts -COPY --from=build-relic /build/crypto/relic/build ./crypto/relic/build - FROM build-env as build-production WORKDIR /app @@ -48,7 +36,7 @@ RUN --mount=type=cache,sharing=locked,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ --mount=type=ssh \ cd integration && \ - CGO_ENABLED=1 go build --tags relic -ldflags "-extldflags -static" -o ./app ./${TARGET} + CGO_ENABLED=1 go build -ldflags "-extldflags -static" -o ./app ./${TARGET} RUN mv /app/integration/app /app/app diff --git a/integration/benchnet2/Makefile b/integration/benchnet2/Makefile index 62859fbf74c..f1979c0f1b4 100644 --- a/integration/benchnet2/Makefile +++ b/integration/benchnet2/Makefile @@ -29,12 +29,12 @@ endif # for the checked out version will be run in the sub folder but the bootstrap folder will be created here (outside of the checked out flow-go in the sub folder) gen-bootstrap: clone-flow cd flow-go && make crypto_setup_gopath - cd flow-go/cmd/bootstrap && go run -tags relic . genconfig --address-format "%s%d-${NETWORK_ID}.${NAMESPACE}:3569" --access $(ACCESS) --collection $(COLLECTION) --consensus $(CONSENSUS) --execution $(EXECUTION) --verification $(VERIFICATION) --weight 100 -o ./ --config ../../../bootstrap/conf/node-config.json - cd flow-go/cmd/bootstrap && go run -tags relic . keygen --machine-account --config ../../../bootstrap/conf/node-config.json -o ../../../bootstrap/keys + cd flow-go/cmd/bootstrap && go run . genconfig --address-format "%s%d-${NETWORK_ID}.${NAMESPACE}:3569" --access $(ACCESS) --collection $(COLLECTION) --consensus $(CONSENSUS) --execution $(EXECUTION) --verification $(VERIFICATION) --weight 100 -o ./ --config ../../../bootstrap/conf/node-config.json + cd flow-go/cmd/bootstrap && go run . keygen --machine-account --config ../../../bootstrap/conf/node-config.json -o ../../../bootstrap/keys echo {} > ./bootstrap/conf/partner-stakes.json mkdir ./bootstrap/partner-nodes - cd flow-go/cmd/bootstrap && go run -tags relic . rootblock --root-chain bench --root-height 0 --root-parent 0000000000000000000000000000000000000000000000000000000000000000 --config ../../../bootstrap/conf/node-config.json -o ../../../bootstrap/ --fast-kg --partner-dir ../../../bootstrap/partner-nodes --partner-weights ../../../bootstrap/conf/partner-stakes.json --internal-priv-dir ../../../bootstrap/keys/private-root-information - cd flow-go/cmd/bootstrap && go run -tags relic . finalize --root-commit 0000000000000000000000000000000000000000000000000000000000000000 --service-account-public-key-json "{\"PublicKey\":\"R7MTEDdLclRLrj2MI1hcp4ucgRTpR15PCHAWLM5nks6Y3H7+PGkfZTP2di2jbITooWO4DD1yqaBSAVK8iQ6i0A==\",\"SignAlgo\":2,\"HashAlgo\":1,\"SeqNumber\":0,\"Weight\":1000}" --config ../../../bootstrap/conf/node-config.json -o ../../../bootstrap/ --partner-dir ../../../bootstrap/partner-nodes --partner-weights ../../../bootstrap/conf/partner-stakes.json --collection-clusters 1 --epoch-counter 0 --epoch-length 30000 --epoch-staking-phase-length 20000 --epoch-dkg-phase-length 2000 --genesis-token-supply="1000000000.0" --protocol-version=0 --internal-priv-dir ../../../bootstrap/keys/private-root-information --dkg-data ../../../bootstrap/private-root-information/root-dkg-data.priv.json --root-block ../../../bootstrap/public-root-information/root-block.json --root-block-votes-dir ../../../bootstrap/public-root-information/root-block-votes/ --epoch-commit-safety-threshold=1000 + cd flow-go/cmd/bootstrap && go run . 
rootblock --root-chain bench --root-height 0 --root-parent 0000000000000000000000000000000000000000000000000000000000000000 --config ../../../bootstrap/conf/node-config.json -o ../../../bootstrap/ --fast-kg --partner-dir ../../../bootstrap/partner-nodes --partner-weights ../../../bootstrap/conf/partner-stakes.json --internal-priv-dir ../../../bootstrap/keys/private-root-information + cd flow-go/cmd/bootstrap && go run . finalize --root-commit 0000000000000000000000000000000000000000000000000000000000000000 --service-account-public-key-json "{\"PublicKey\":\"R7MTEDdLclRLrj2MI1hcp4ucgRTpR15PCHAWLM5nks6Y3H7+PGkfZTP2di2jbITooWO4DD1yqaBSAVK8iQ6i0A==\",\"SignAlgo\":2,\"HashAlgo\":1,\"SeqNumber\":0,\"Weight\":1000}" --config ../../../bootstrap/conf/node-config.json -o ../../../bootstrap/ --partner-dir ../../../bootstrap/partner-nodes --partner-weights ../../../bootstrap/conf/partner-stakes.json --collection-clusters 1 --epoch-counter 0 --epoch-length 30000 --epoch-staking-phase-length 20000 --epoch-dkg-phase-length 2000 --genesis-token-supply="1000000000.0" --protocol-version=0 --internal-priv-dir ../../../bootstrap/keys/private-root-information --dkg-data ../../../bootstrap/private-root-information/root-dkg-data.priv.json --root-block ../../../bootstrap/public-root-information/root-block.json --root-block-votes-dir ../../../bootstrap/public-root-information/root-block-votes/ --epoch-commit-safety-threshold=1000 gen-helm-l1: go run automate/cmd/level1/bootstrap.go --data bootstrap/public-root-information/root-protocol-state-snapshot.json --dockerTag $(NETWORK_ID) --dockerRegistry $(DOCKER_REGISTRY) diff --git a/integration/localnet/Makefile b/integration/localnet/Makefile index f35cb0643e0..ac548916ae5 100644 --- a/integration/localnet/Makefile +++ b/integration/localnet/Makefile @@ -43,7 +43,7 @@ ifeq ($(strip $(VALID_EXECUTION)), 1) else ifeq ($(strip $(VALID_CONSENSUS)), 1) $(error Number of Consensus nodes should be no less than 2) else - go run -tags relic \ + go run \ -ldflags="-X 'github.com/onflow/flow-go/cmd/build.commit=${COMMIT}' \ -X 'github.com/onflow/flow-go/cmd/build.semver=${VERSION}'" \ builder/*.go \ @@ -119,15 +119,15 @@ stop: .PHONY: load load: - go run --tags relic ../benchmark/cmd/manual -log-level info -tps 1,10,100 -tps-durations 30s,30s + go run ../benchmark/cmd/manual -log-level info -tps 1,10,100 -tps-durations 30s,30s .PHONY: tps-ci-smoke tps-ci-smoke: - go run --tags relic ../benchmark/cmd/ci -log-level info -tps-initial 1 -tps-min 1 -tps-max 10 -duration 20s -tps-adjust-interval 1s -stat-interval 1s -bigquery-upload=false + go run ../benchmark/cmd/ci -log-level info -tps-initial 1 -tps-min 1 -tps-max 10 -duration 20s -tps-adjust-interval 1s -stat-interval 1s -bigquery-upload=false .PHONY: tps-ci tps-ci: bootstrap-ci build-flow start-flow - go run --tags relic ../benchmark/cmd/ci -log-level info -tps-initial $(TPS_INIT) -tps-min $(TPS_MIN) -tps-max $(TPS_MAX) -duration $(DURATION) + go run ../benchmark/cmd/ci -log-level info -tps-initial $(TPS_INIT) -tps-min $(TPS_MIN) -tps-max $(TPS_MAX) -duration $(DURATION) .PHONY: clean-data clean-data: diff --git a/module/metrics/example/README.md b/module/metrics/example/README.md index f693cac0780..ec319414ad8 100644 --- a/module/metrics/example/README.md +++ b/module/metrics/example/README.md @@ -18,7 +18,7 @@ You can choose one of the following: Note: Running example with `-happypath` flag examines the metrics collection on a real happy path of verification node. 
``` - go run --tags=relic module/metrics/example/verification/main.go + go run module/metrics/example/verification/main.go ``` - Consensus Node: ``` diff --git a/tools/test_monitor/level1/process_summary1_results_test.go b/tools/test_monitor/level1/process_summary1_results_test.go index c64f8442995..6e7b12f0551 100644 --- a/tools/test_monitor/level1/process_summary1_results_test.go +++ b/tools/test_monitor/level1/process_summary1_results_test.go @@ -33,19 +33,19 @@ func TestGenerateLevel1Summary_Struct(t *testing.T) { RawJSONTestRunFile: "test-result-crypto-hash-1-count-skip-pass.json", }, - // raw results generated with: go test -json -count 1 --tags relic ./utils/unittest/... + // raw results generated with: go test -json -count 1 ./utils/unittest/... "2 count all pass": { ExpectedLevel1Summary: testdata.GetTestData_Level1_2CountPass(), RawJSONTestRunFile: "test-result-crypto-hash-2-count-pass.json", }, - // raw results generated with: go test -json -count 1 --tags relic ./utils/unittest/... + // raw results generated with: go test -json -count 1 ./utils/unittest/... "10 count all pass": { ExpectedLevel1Summary: testdata.GetTestData_Level1_10CountPass(), RawJSONTestRunFile: "test-result-crypto-hash-10-count-pass.json", }, - // raw results generated with: go test -json -count 1 --tags relic ./utils/unittest/... + // raw results generated with: go test -json -count 1 ./utils/unittest/... "10 count some failures": { ExpectedLevel1Summary: testdata.GetTestData_Level1_10CountSomeFailures(), RawJSONTestRunFile: "test-result-crypto-hash-10-count-fail.json", @@ -54,14 +54,14 @@ func TestGenerateLevel1Summary_Struct(t *testing.T) { // no result tests - tests below don't generate pass/fail result due to `go test` bug // with using `fmt.printf("log message")` without newline `\n` - // raw results generated with: go test -v -tags relic -count=1 -json ./model/encodable/. -test.run TestEncodableRandomBeaconPrivKeyMsgPack + // raw results generated with: go test -v -count=1 -json ./model/encodable/. -test.run TestEncodableRandomBeaconPrivKeyMsgPack // this is a single unit test that produces a no result "1 count single no result test": { ExpectedLevel1Summary: testdata.GetTestData_Level1_1CountSingleExceptionTest(), RawJSONTestRunFile: "test-result-exception-single-1-count-pass.json", }, - //raw results generated with: go test -v -tags relic -count=5 -json ./model/encodable/. -test.run TestEncodableRandomBeaconPrivKeyMsgPack + //raw results generated with: go test -v -count=5 -json ./model/encodable/. -test.run TestEncodableRandomBeaconPrivKeyMsgPack //multiple no result tests in a row "5 no result tests in a row": { ExpectedLevel1Summary: testdata.GetTestData_Level1_5CountSingleExceptionTest(), @@ -74,7 +74,7 @@ func TestGenerateLevel1Summary_Struct(t *testing.T) { RawJSONTestRunFile: "test-result-exception-single-5-count-4-nil-1-normal-pass.json", }, - // raw results generated with: go test -v -tags relic -count=3 -json ./model/encodable/. + // raw results generated with: go test -v -count=3 -json ./model/encodable/. // group of unit tests with a single no result test "3 count no result test with normal tests": { ExpectedLevel1Summary: testdata.GetTestData_Leve1_3CountExceptionWithNormalTests(), diff --git a/utils/binstat/binstat_external_test.go b/utils/binstat/binstat_external_test.go index 9ffa7b23065..10f8b911ff9 100644 --- a/utils/binstat/binstat_external_test.go +++ b/utils/binstat/binstat_external_test.go @@ -28,7 +28,7 @@ import ( * 5. 
Strip "time" field from JSON log line output for shorter read, and * 6. Show the amount of code coverage from the tests. * - * pushd utils/binstat ; go fmt ./*.go ; golangci-lint run && go test -v -vv -coverprofile=coverage.txt -covermode=atomic --tags relic ./... | perl -lane 's~\\n~\n~g; s~"time".*?,~~g; print;' ; go tool cover -func=coverage.txt ; popd + * pushd utils/binstat ; go fmt ./*.go ; golangci-lint run && go test -v -vv -coverprofile=coverage.txt -covermode=atomic ./... | perl -lane 's~\\n~\n~g; s~"time".*?,~~g; print;' ; go tool cover -func=coverage.txt ; popd */ /* From 4eabaf1801d35d771dffb30c8ef8002b39df2c56 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 24 May 2023 18:46:26 -0600 Subject: [PATCH 111/200] remove relic tags from go files --- .../verification/combined_verifier_v2.go | 3 -- .../verification/combined_verifier_v3.go | 3 -- .../hotstuff/verification/staking_verifier.go | 3 -- crypto_setup.sh | 32 ----------------- .../computation/computer/spock_norelic.go | 26 -------------- .../computation/computer/spock_relic.go | 3 -- module/dkg_broker.go | 3 -- module/signature/aggregation.go | 3 -- module/signature/aggregation_no_relic.go | 34 ------------------- module/signature/aggregation_test.go | 3 -- 10 files changed, 113 deletions(-) delete mode 100644 crypto_setup.sh delete mode 100644 engine/execution/computation/computer/spock_norelic.go delete mode 100644 module/signature/aggregation_no_relic.go diff --git a/consensus/hotstuff/verification/combined_verifier_v2.go b/consensus/hotstuff/verification/combined_verifier_v2.go index ee67a4ea36a..560cb1f8ece 100644 --- a/consensus/hotstuff/verification/combined_verifier_v2.go +++ b/consensus/hotstuff/verification/combined_verifier_v2.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package verification import ( diff --git a/consensus/hotstuff/verification/combined_verifier_v3.go b/consensus/hotstuff/verification/combined_verifier_v3.go index 8f5f9acd8f0..39af088ae0d 100644 --- a/consensus/hotstuff/verification/combined_verifier_v3.go +++ b/consensus/hotstuff/verification/combined_verifier_v3.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package verification import ( diff --git a/consensus/hotstuff/verification/staking_verifier.go b/consensus/hotstuff/verification/staking_verifier.go index 60b2f45f4d5..ecd5013f171 100644 --- a/consensus/hotstuff/verification/staking_verifier.go +++ b/consensus/hotstuff/verification/staking_verifier.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package verification import ( diff --git a/crypto_setup.sh b/crypto_setup.sh deleted file mode 100644 index e9789c74a23..00000000000 --- a/crypto_setup.sh +++ /dev/null @@ -1,32 +0,0 @@ - -#!/bin/bash - -# crypto package -PKG_NAME="github.com/onflow/flow-go/crypto" - -# go.mod -MOD_FILE="./go.mod" - -# the version of onflow/flow-go/crypto used in the project is read from the go.mod file -if [ -f "${MOD_FILE}" ] -then - # extract the imported version - VERSION="$(go list -f '{{.Version}}' -m ${PKG_NAME})" - # go get the package - go get "${PKG_NAME}@${VERSION}" || { echo "go get the package failed"; exit 1; } - # using the right version, get the package directory path - PKG_DIR="$(go env GOPATH)/pkg/mod/${PKG_NAME}@${VERSION}" -else - { echo "couldn't find go.mod file - make sure the script is in the project root directory"; exit 1; } -fi - -# grant permissions if not existant -if [[ ! -r ${PKG_DIR} || ! -w ${PKG_DIR} || ! 
-x ${PKG_DIR} ]]; then - chmod -R 755 "${PKG_DIR}" -fi - -# get into the package directory and set up the external dependencies -( - cd "${PKG_DIR}" || { echo "cd into the GOPATH package folder failed"; exit 1; } - go generate -) diff --git a/engine/execution/computation/computer/spock_norelic.go b/engine/execution/computation/computer/spock_norelic.go deleted file mode 100644 index 81678d94f33..00000000000 --- a/engine/execution/computation/computer/spock_norelic.go +++ /dev/null @@ -1,26 +0,0 @@ -//go:build !relic -// +build !relic - -package computer - -import ( - "github.com/onflow/flow-go/crypto" - "github.com/onflow/flow-go/crypto/hash" -) - -// This is a temporary wrapper that simulates a call to SPoCK prove, -// required for the emulator build. The function is never called by the -// emulator although it is required for a successful build. -// -// TODO(tarak): remove once the crypto module properly implements a non-relic -// version of SPOCKProve. -func SPOCKProve( - sk crypto.PrivateKey, - data []byte, - kmac hash.Hasher, -) ( - crypto.Signature, - error, -) { - panic("SPoCK prove not supported when flow-go is built without relic") -} diff --git a/engine/execution/computation/computer/spock_relic.go b/engine/execution/computation/computer/spock_relic.go index 89a8182ba8f..0fcb835adcd 100644 --- a/engine/execution/computation/computer/spock_relic.go +++ b/engine/execution/computation/computer/spock_relic.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package computer import ( diff --git a/module/dkg_broker.go b/module/dkg_broker.go index 49ebb0ad051..7e64353816e 100644 --- a/module/dkg_broker.go +++ b/module/dkg_broker.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package module import ( diff --git a/module/signature/aggregation.go b/module/signature/aggregation.go index 99129c656dc..76101ee3805 100644 --- a/module/signature/aggregation.go +++ b/module/signature/aggregation.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package signature import ( diff --git a/module/signature/aggregation_no_relic.go b/module/signature/aggregation_no_relic.go deleted file mode 100644 index 6b51c6f35a3..00000000000 --- a/module/signature/aggregation_no_relic.go +++ /dev/null @@ -1,34 +0,0 @@ -//go:build !relic -// +build !relic - -package signature - -import ( - "github.com/onflow/flow-go/crypto" -) - -const panic_relic = "function only supported with the relic build tag" - -// These functions are the non-relic versions of some public functions from the package. -// The functions are here to allow the build of flow-emulator, since the emulator is built -// without the "relic" build tag, and does not run the functions below. 
-type SignatureAggregatorSameMessage struct{} - -func NewSignatureAggregatorSameMessage( - message []byte, - dsTag string, - publicKeys []crypto.PublicKey, -) (*SignatureAggregatorSameMessage, error) { - panic(panic_relic) -} - -func (s *SignatureAggregatorSameMessage) Verify(signer int, sig crypto.Signature) (bool, error) { - panic(panic_relic) -} -func (s *SignatureAggregatorSameMessage) TrustedAdd(signer int, sig crypto.Signature) error { - panic(panic_relic) -} - -func (s *SignatureAggregatorSameMessage) Aggregate() ([]int, crypto.Signature, error) { - panic(panic_relic) -} diff --git a/module/signature/aggregation_test.go b/module/signature/aggregation_test.go index aacd0a89f06..4291a7d5734 100644 --- a/module/signature/aggregation_test.go +++ b/module/signature/aggregation_test.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package signature import ( From 9b17f933ffeeefeefb27e1de7255d149b186861a Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 24 May 2023 19:07:36 -0600 Subject: [PATCH 112/200] remove more relic related tags and code --- .github/workflows/bench.yml | 3 --- .github/workflows/cd.yml | 2 -- .github/workflows/flaky-test-debug.yml | 5 ----- .github/workflows/tools.yml | 2 -- cmd/Dockerfile | 4 ++-- .../computation/computer/result_collector.go | 2 +- .../computation/computer/spock_relic.go | 21 ------------------- 7 files changed, 3 insertions(+), 36 deletions(-) delete mode 100644 engine/execution/computation/computer/spock_relic.go diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 7c3c6d896bd..b57bbbd440d 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -44,9 +44,6 @@ jobs: go-version: "1.19" cache: true - - name: Build relic - run: make crypto_setup_gopath - - name: Run benchmark on current branch run: | (for i in {1..${{ steps.settings.outputs.benchmark_repetitions }}}; do go test ./fvm ./engine/execution/computation --bench . -shuffle=on --benchmem --run ^$; done) | tee new.txt diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index eb28e840078..962242cb888 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -17,8 +17,6 @@ jobs: go-version: '1.19' - name: Checkout repo uses: actions/checkout@v2 - - name: Build relic - run: make crypto_setup_gopath # Provide Google Service Account credentials to Github Action, allowing interaction with the Google Container Registry # Logging in as github-actions@dl-flow.iam.gserviceaccount.com - name: Docker login diff --git a/.github/workflows/flaky-test-debug.yml b/.github/workflows/flaky-test-debug.yml index 3e5092c9f07..722b9ed2f4e 100644 --- a/.github/workflows/flaky-test-debug.yml +++ b/.github/workflows/flaky-test-debug.yml @@ -27,8 +27,6 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Build relic - run: make crypto_setup_gopath - name: Run go generate run: go generate working-directory: ${{ matrix.dir }} @@ -37,7 +35,6 @@ jobs: with: # Required: the version of golangci-lint is required and must be specified without patch version: we always use the latest patch version. 
version: v1.49 - args: -v --build-tags relic working-directory: ${{ matrix.dir }} # https://github.com/golangci/golangci-lint-action/issues/244 skip-cache: true @@ -192,8 +189,6 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Build relic - run: make crypto_setup_gopath - name: Docker build run: make docker-build-flow docker-build-flow-corrupt - name: Run tests diff --git a/.github/workflows/tools.yml b/.github/workflows/tools.yml index 2e297adb6ff..2cd9ee447a8 100644 --- a/.github/workflows/tools.yml +++ b/.github/workflows/tools.yml @@ -34,8 +34,6 @@ jobs: uses: actions/checkout@v2 with: ref: ${{ inputs.tag }} - - name: Build relic - run: make crypto_setup_gopath - name: Build and upload boot-tools run: | make tool-bootstrap tool-transit diff --git a/cmd/Dockerfile b/cmd/Dockerfile index fc4bcf7badb..4e38b48432f 100644 --- a/cmd/Dockerfile +++ b/cmd/Dockerfile @@ -36,7 +36,7 @@ WORKDIR /app ARG GOARCH=amd64 # TAGS can be overriden to modify the go build tags (e.g. build without netgo) -ARG TAGS="relic,netgo" +ARG TAGS="netgo" # Keep Go's build cache between builds. # https://github.com/golang/go/issues/27719#issuecomment-514747274 @@ -64,7 +64,7 @@ ARG GOARCH=amd64 RUN --mount=type=ssh \ --mount=type=cache,sharing=locked,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ - CGO_ENABLED=1 GOOS=linux go build --tags "relic,netgo" -ldflags "-extldflags -static \ + CGO_ENABLED=1 GOOS=linux go build --tags "netgo" -ldflags "-extldflags -static \ -X 'github.com/onflow/flow-go/cmd/build.commit=${COMMIT}' -X 'github.com/onflow/flow-go/cmd/build.semver=${VERSION}'" \ -gcflags="all=-N -l" -o ./app ${TARGET} diff --git a/engine/execution/computation/computer/result_collector.go b/engine/execution/computation/computer/result_collector.go index dd6a6f90ade..4102d19efb3 100644 --- a/engine/execution/computation/computer/result_collector.go +++ b/engine/execution/computation/computer/result_collector.go @@ -171,7 +171,7 @@ func (collector *resultCollector) commitCollection( spock, err := collector.signer.SignFunc( collectionExecutionSnapshot.SpockSecret, collector.spockHasher, - SPOCKProve) + crypto.SPOCKProve) if err != nil { return fmt.Errorf("signing spock hash failed: %w", err) } diff --git a/engine/execution/computation/computer/spock_relic.go b/engine/execution/computation/computer/spock_relic.go deleted file mode 100644 index 0fcb835adcd..00000000000 --- a/engine/execution/computation/computer/spock_relic.go +++ /dev/null @@ -1,21 +0,0 @@ -package computer - -import ( - "github.com/onflow/flow-go/crypto" - "github.com/onflow/flow-go/crypto/hash" -) - -// This is a temporary wrapper that around the crypto library. -// -// TODO(tarak): remove once the crypto module properly implements a non-relic -// version of SPOCKProve. 
-func SPOCKProve( - sk crypto.PrivateKey, - data []byte, - kmac hash.Hasher, -) ( - crypto.Signature, - error, -) { - return crypto.SPOCKProve(sk, data, kmac) -} From 57215468aab0d425bef4a5f84b92de9a977563de Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 25 May 2023 00:31:16 -0600 Subject: [PATCH 113/200] remove crypto_setup_gopath --- Makefile | 4 ++-- cmd/Dockerfile | 3 +-- integration/benchnet2/Makefile | 1 - tools/test_monitor/run-tests.sh | 5 +---- 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 6d9b2321bab..cd402f40f1e 100644 --- a/Makefile +++ b/Makefile @@ -67,7 +67,7 @@ install-mock-generators: go install github.com/golang/mock/mockgen@v1.6.0; .PHONY: install-tools -install-tools: crypto_setup_gopath check-go-version install-mock-generators +install-tools: check-go-version install-mock-generators cd ${GOPATH}; \ go install github.com/golang/protobuf/protoc-gen-go@v1.3.2; \ go install github.com/uber/prototool/cmd/prototool@v1.9.0; \ @@ -207,7 +207,7 @@ ci: install-tools test # Runs integration tests .PHONY: ci-integration -ci-integration: crypto_setup_gopath +ci-integration: $(MAKE) -C integration ci-integration-test # Runs benchmark tests diff --git a/cmd/Dockerfile b/cmd/Dockerfile index 4e38b48432f..5dbde25bfb4 100644 --- a/cmd/Dockerfile +++ b/cmd/Dockerfile @@ -25,8 +25,7 @@ COPY . . RUN --mount=type=cache,sharing=locked,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ - --mount=type=secret,id=git_creds,dst=/root/.netrc \ - make crypto_setup_gopath + --mount=type=secret,id=git_creds,dst=/root/.netrc #################################### ## (3) Build the production app binary diff --git a/integration/benchnet2/Makefile b/integration/benchnet2/Makefile index f1979c0f1b4..73364e104c2 100644 --- a/integration/benchnet2/Makefile +++ b/integration/benchnet2/Makefile @@ -28,7 +28,6 @@ endif # assumes there is a checked out version of flow-go in a "flow-go" sub-folder at this level so that the bootstrap executable # for the checked out version will be run in the sub folder but the bootstrap folder will be created here (outside of the checked out flow-go in the sub folder) gen-bootstrap: clone-flow - cd flow-go && make crypto_setup_gopath cd flow-go/cmd/bootstrap && go run . genconfig --address-format "%s%d-${NETWORK_ID}.${NAMESPACE}:3569" --access $(ACCESS) --collection $(COLLECTION) --consensus $(CONSENSUS) --execution $(EXECUTION) --verification $(VERIFICATION) --weight 100 -o ./ --config ../../../bootstrap/conf/node-config.json cd flow-go/cmd/bootstrap && go run . 
keygen --machine-account --config ../../../bootstrap/conf/node-config.json -o ../../../bootstrap/keys echo {} > ./bootstrap/conf/partner-stakes.json diff --git a/tools/test_monitor/run-tests.sh b/tools/test_monitor/run-tests.sh index 0cbf1383b19..c30085ffc21 100755 --- a/tools/test_monitor/run-tests.sh +++ b/tools/test_monitor/run-tests.sh @@ -23,7 +23,6 @@ then fi echo "preparing $TEST_CATEGORY tests">&2 - make crypto_setup_gopath make docker-build-flow docker-build-flow-corrupt echo "running $TEST_CATEGORY tests">&2 make -C integration -s ${BASH_REMATCH[1]}-tests > test-output @@ -37,10 +36,8 @@ else make -s unittest-main > test-output ;; unit-crypto) - echo "preparing crypto unit tests">&2 - make -C crypto setup echo "running crypto unit tests">&2 - make -C crypto -s unittest > test-output + make -C crypto -s test > test-output ;; unit-insecure) echo "preparing insecure unit tests">&2 From 06572e347c798177e8a46b7289bf412b568c9353 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 25 May 2023 00:48:18 -0600 Subject: [PATCH 114/200] update go generate --- crypto/common.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/crypto/common.go b/crypto/common.go index 7e460cbf6d2..b9e072c9930 100644 --- a/crypto/common.go +++ b/crypto/common.go @@ -8,9 +8,6 @@ import ( //revive:disable:var-naming -// the `go generate` command requires bash scripting, `cmake` and `git`. -//go:generate bash ./build_dependency.sh - const ( // Minimum targeted bits of security. // This is used as a reference but it doesn't mean all implemented primitives provide this minimum. From 33c5e0e266a64d8b6d5a90624e469a3238d553e9 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 26 May 2023 13:55:43 -0600 Subject: [PATCH 115/200] remove cmake install from dockerfile --- cmd/Dockerfile | 2 +- crypto/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/Dockerfile b/cmd/Dockerfile index 5dbde25bfb4..90075485922 100644 --- a/cmd/Dockerfile +++ b/cmd/Dockerfile @@ -6,7 +6,7 @@ FROM golang:1.19-bullseye AS build-setup RUN apt-get update -RUN apt-get -y install cmake zip +RUN apt-get -y install zip ## (2) Setup crypto dependencies FROM build-setup AS build-env diff --git a/crypto/Dockerfile b/crypto/Dockerfile index 37a0b373171..7566ea751b3 100644 --- a/crypto/Dockerfile +++ b/crypto/Dockerfile @@ -2,7 +2,7 @@ FROM golang:1.19-buster RUN apt-get update -RUN apt-get -y install cmake zip +RUN apt-get -y install zip RUN go install github.com/axw/gocov/gocov@latest RUN go install github.com/matm/gocov-html@latest WORKDIR /go/src/flow From 83f42fb5d90758fade9ea721b0719d9c4f6aa581 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 26 May 2023 18:08:58 -0600 Subject: [PATCH 116/200] clean up header files in blst_include.h --- crypto/blst_include.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/crypto/blst_include.h b/crypto/blst_include.h index e408c9c0c70..89966463c61 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -4,12 +4,10 @@ // extra tools to use BLST low level that are needed by the Flow crypto library // eventually this file would replace blst.h -#include "bls12381_utils.h" #include "point.h" #include "fields.h" #include "consts.h" #include "errors.h" -#include "sha256.h" // types used by the Flow crypto library that are imported from BLST // these type definitions are used as an abstraction from BLST internal types @@ -66,8 +64,8 @@ typedef _Bool bool; /* it's assumed that cgo calls modern enough compiler */ // are represented 
as a little endian vector of limbs. // `Fr` is equivalent to type `vec256` (used internally by BLST for F_r elements). // `Fr` is defined as a struct to be exportable through cgo to the Go layer. -#define R_BITS 255 -typedef struct {limb_t limbs[(R_BITS+63)/64];} Fr; // TODO: use Fr_LIMBS +#define R_BITS 255 // equal to Fr_bits in bls12381_utils.h +typedef struct {limb_t limbs[(R_BITS+63)/64];} Fr; // field elements F_p // F_p elements are represented as big numbers reduced modulo `p`. Big numbers From ec2ceb400891ae67bbc9deb38835901bda359513 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 26 May 2023 19:18:38 -0600 Subject: [PATCH 117/200] update boolean usage from bool_t to C type bool --- crypto/bls.go | 2 +- crypto/bls12381_utils.c | 28 ++++++++-------- crypto/bls12381_utils.go | 16 ++++----- crypto/bls12381_utils.h | 27 ++++++++------- crypto/bls_thresholdsign_core.c | 6 ++-- crypto/bls_thresholdsign_include.h | 2 +- crypto/blst_include.h | 54 +++--------------------------- crypto/dkg_core.c | 2 +- crypto/dkg_feldmanvss.go | 4 +-- crypto/dkg_feldmanvssq.go | 5 +-- crypto/dkg_include.h | 2 +- 11 files changed, 54 insertions(+), 94 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index c8650c9dc60..3206a29cdf9 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -347,7 +347,7 @@ func (a *blsBLS12381Algo) decodePublicKey(publicKeyBytes []byte) (PublicKey, err } // membership check in G2 - if C.E2_in_G2((*C.E2)(&pk.point)) == (C.ulonglong)(0) { + if !bool(C.E2_in_G2((*C.E2)(&pk.point))) { return nil, invalidInputsErrorf("input key is infinity or does not encode a BLS12-381 point in the valid group") } diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index a6b1e5c5e44..07224cd4242 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -38,12 +38,12 @@ const Fr BLS12_381_rR = {{ \ }}; // returns true if a == 0 and false otherwise -bool_t Fr_is_zero(const Fr* a) { +bool Fr_is_zero(const Fr* a) { return bytes_are_zero((const byte*)a, sizeof(Fr)); } // returns true if a == b and false otherwise -bool_t Fr_is_equal(const Fr* a, const Fr* b) { +bool Fr_is_equal(const Fr* a, const Fr* b) { return vec_is_equal(a, b, sizeof(Fr)); } @@ -265,7 +265,7 @@ static void Fr_from_be_bytes(Fr* out, const byte *bytes, size_t n) // Reads a scalar from an array and maps it to Fr using modular reduction. // Input is byte-big-endian as used by the external APIs. // It returns true if scalar is zero and false otherwise. -bool_t map_bytes_to_Fr(Fr* a, const byte* bin, int len) { +bool map_bytes_to_Fr(Fr* a, const byte* bin, int len) { Fr_from_be_bytes(a, bin, len); return Fr_is_zero(a); } @@ -311,7 +311,7 @@ static void Fp_neg(Fp *res, const Fp *a) { // The boolean output is valid whether `a` is in Montgomery form or not, // since montgomery constant `R` is a quadratic residue. // However, the square root is valid only if `a` is in montgomery form. -static bool_t Fp_sqrt_montg(Fp *res, const Fp* a) { +static bool Fp_sqrt_montg(Fp *res, const Fp* a) { return sqrt_fp((limb_t*)res, (limb_t*)a); } @@ -415,7 +415,7 @@ static void Fp2_squ_montg(Fp2 *res, const Fp2 *a) { // The boolean output is valid whether `a` is in Montgomery form or not, // since montgomery constant `R` is a quadratic residue. // However, the square root is valid only if `a` is in montgomery form. 
-static bool_t Fp2_sqrt_montg(Fp2 *res, const Fp2* a) { +static bool Fp2_sqrt_montg(Fp2 *res, const Fp2* a) { return sqrt_fp2((vec384*)res, (vec384*)a); } @@ -466,13 +466,13 @@ void E1_copy(E1* res, const E1* p) { } // checks p1 == p2 -bool_t E1_is_equal(const E1* p1, const E1* p2) { +bool E1_is_equal(const E1* p1, const E1* p2) { // `POINTonE1_is_equal` includes the infinity case return POINTonE1_is_equal((const POINTonE1*)p1, (const POINTonE1*)p2); } // compare p to infinity -bool_t E1_is_infty(const E1* p) { +bool E1_is_infty(const E1* p) { // BLST infinity points are defined by Z=0 return vec_is_zero(p->z, sizeof(p->z)); } @@ -495,14 +495,14 @@ void E1_to_affine(E1* res, const E1* p) { } // checks affine point `p` is in E1 -bool_t E1_affine_on_curve(const E1* p) { +bool E1_affine_on_curve(const E1* p) { // BLST's `POINTonE1_affine_on_curve` does not include the inifity case! return POINTonE1_affine_on_curve((POINTonE1_affine*)p) | E1_is_infty(p); } // checks if input E1 point is on the subgroup G1. // It assumes input `p` is on E1. -bool_t E1_in_G1(const E1* p){ +bool E1_in_G1(const E1* p){ // currently uses Scott method return POINTonE1_in_G1((const POINTonE1*)p); } @@ -859,19 +859,19 @@ void E2_set_infty(E2* p) { } // check if `p` is infinity -bool_t E2_is_infty(const E2* p) { +bool E2_is_infty(const E2* p) { // BLST infinity points are defined by Z=0 return vec_is_zero(p->z, sizeof(p->z)); } // checks affine point `p` is in E2 -bool_t E2_affine_on_curve(const E2* p) { +bool E2_affine_on_curve(const E2* p) { // BLST's `POINTonE2_affine_on_curve` does not include the infinity case! return POINTonE2_affine_on_curve((POINTonE2_affine*)p) | E2_is_infty(p); } // checks p1 == p2 -bool_t E2_is_equal(const E2* p1, const E2* p2) { +bool E2_is_equal(const E2* p1, const E2* p2) { // `POINTonE2_is_equal` includes the infinity case return POINTonE2_is_equal((const POINTonE2*)p1, (const POINTonE2*)p2); } @@ -935,7 +935,7 @@ void G2_mult_gen(E2* res, const Fr* expo) { // checks if input E2 point is on the subgroup G2. // It assumes input `p` is on E2. -bool_t E2_in_G2(const E2* p){ +bool E2_in_G2(const E2* p){ // currently uses Scott method return POINTonE2_in_G2((const POINTonE2*)p); } @@ -1084,7 +1084,7 @@ BLST_ERROR unsafe_map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { // ------------------- Pairing utilities -bool_t Fp12_is_one(Fp12 *a) { +bool Fp12_is_one(Fp12 *a) { return vec_is_equal(a, BLS12_381_Rx.p12, sizeof(Fp12)); } diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 9695d45aba2..94083cf9abe 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -89,28 +89,28 @@ func generatorScalarMultG2(res *pointE2, expo *scalar) { // comparison in Fr where r is the group order of G1/G2 // (both scalars should be reduced mod r) func (x *scalar) equals(other *scalar) bool { - return C.Fr_is_equal((*C.Fr)(x), (*C.Fr)(other)) != 0 + return bool(C.Fr_is_equal((*C.Fr)(x), (*C.Fr)(other))) } // comparison in E1 func (p *pointE1) equals(other *pointE1) bool { - return C.E1_is_equal((*C.E1)(p), (*C.E1)(other)) != 0 + return bool(C.E1_is_equal((*C.E1)(p), (*C.E1)(other))) } // comparison in E2 func (p *pointE2) equals(other *pointE2) bool { - return C.E2_is_equal((*C.E2)(p), (*C.E2)(other)) != 0 + return bool(C.E2_is_equal((*C.E2)(p), (*C.E2)(other))) } // Comparison to zero in Fr. 
// Scalar must be already reduced modulo r func (x *scalar) isZero() bool { - return C.Fr_is_zero((*C.Fr)(x)) != 0 + return bool(C.Fr_is_zero((*C.Fr)(x))) } // Comparison to point at infinity in G2. func (p *pointE2) isInfinity() bool { - return C.E2_is_infty((*C.E2)(p)) != 0 + return bool(C.E2_is_infty((*C.E2)(p))) } // generates a random element in F_r using input random source, @@ -142,7 +142,7 @@ func mapToFr(x *scalar, src []byte) bool { isZero := C.map_bytes_to_Fr((*C.Fr)(x), (*C.uchar)(&src[0]), (C.int)(len(src))) - return isZero != (C.ulonglong)(0) + return bool(isZero) } // writeScalar writes a scalar in a slice of bytes @@ -231,13 +231,13 @@ func readPointE1(a *pointE1, src []byte) error { // checkMembershipG1 wraps a call to a subgroup check in G1 since cgo can't be used // in go test files. func checkMembershipG1(pt *pointE1) bool { - return C.E1_in_G1((*C.E1)(pt)) != (C.ulonglong)(0) + return bool(C.E1_in_G1((*C.E1)(pt))) } // checkMembershipG2 wraps a call to a subgroup check in G2 since cgo can't be used // in go test files. func checkMembershipG2(pt *pointE2) bool { - return C.E2_in_G2((*C.E2)(pt)) != (C.ulonglong)(0) + return bool(C.E2_in_G2((*C.E2)(pt))) } // This is only a TEST/DEBUG/BENCH function. diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index d2f2d8b489f..3e8ca1b06ea 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -8,6 +8,9 @@ #include #include "blst_include.h" +typedef uint8_t byte; +typedef _Bool bool; // assuming cgo is using a modern enough compiler + #define SEC_BITS 128 #define VALID 0 #define INVALID 1 @@ -50,8 +53,8 @@ int bls_spock_verify(const E2*, const byte*, const E2*, const byte*); // Fr utilities extern const Fr BLS12_381_rR; -bool_t Fr_is_zero(const Fr* a); -bool_t Fr_is_equal(const Fr* a, const Fr* b); +bool Fr_is_zero(const Fr* a); +bool Fr_is_equal(const Fr* a, const Fr* b); void Fr_set_limb(Fr*, const limb_t); void Fr_copy(Fr*, const Fr*); void Fr_set_zero(Fr*); @@ -69,7 +72,7 @@ void Fr_inv_exp_montg(Fr *res, const Fr *a); BLST_ERROR Fr_read_bytes(Fr* a, const byte *bin, int len); BLST_ERROR Fr_star_read_bytes(Fr* a, const byte *bin, int len); void Fr_write_bytes(byte *bin, const Fr* a); -bool_t map_bytes_to_Fr(Fr*, const byte*, int); +bool map_bytes_to_Fr(Fr*, const byte*, int); // Fp utilities void Fp_mul_montg(Fp *, const Fp *, const Fp *); @@ -77,12 +80,12 @@ void Fp_squ_montg(Fp *, const Fp *); // E1 and G1 utilities void E1_copy(E1*, const E1*); -bool_t E1_is_equal(const E1*, const E1*); +bool E1_is_equal(const E1*, const E1*); void E1_set_infty(E1*); -bool_t E1_is_infty(const E1*); +bool E1_is_infty(const E1*); void E1_to_affine(E1*, const E1*); -bool_t E1_affine_on_curve(const E1*); -bool_t E1_in_G1(const E1*); +bool E1_affine_on_curve(const E1*); +bool E1_in_G1(const E1*); void E1_mult(E1*, const E1*, const Fr*); void E1_add(E1*, const E1*, const E1*); void E1_neg(E1*, const E1*); @@ -99,9 +102,9 @@ int map_to_G1(E1*, const byte*, const int); // E2 and G2 utilities void E2_set_infty(E2* p); -bool_t E2_is_infty(const E2*); -bool_t E2_affine_on_curve(const E2*); -bool_t E2_is_equal(const E2*, const E2*); +bool E2_is_infty(const E2*); +bool E2_affine_on_curve(const E2*); +bool E2_is_equal(const E2*, const E2*); void E2_copy(E2*, const E2*); void E2_to_affine(E2*, const E2*); BLST_ERROR E2_read_bytes(E2*, const byte *, const int); @@ -113,12 +116,12 @@ void E2_add(E2* res, const E2* a, const E2* b); void E2_neg(E2*, const E2*); void E2_sum_vector(E2*, const E2*, const int); void 
E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); -bool_t E2_in_G2(const E2*); +bool E2_in_G2(const E2*); void unsafe_map_bytes_to_G2(E2*, const byte*, int); BLST_ERROR unsafe_map_bytes_to_G2complement(E2*, const byte*, int); // pairing and Fp12 -bool_t Fp12_is_one(Fp12*); +bool Fp12_is_one(Fp12*); void Fp12_set_one(Fp12*); void multi_pairing(Fp12*, const E1*, const E2*, const int); diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index e160a16e7c9..78a87823b4c 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -7,7 +7,7 @@ // Computes the Lagrange coefficient L_i(0) in Fr with regards to the range [indices(0)..indices(t)] // and stores it in `res`, where t is the degree of the polynomial P. // `len` is equal to `t+1` where `t` is the polynomial degree. -static void Fr_lagrange_coeff_at_zero(Fr* res, const int i, const uint8_t indices[], const int len){ +static void Fr_lagrange_coeff_at_zero(Fr* res, const int i, const byte indices[], const int len){ // coefficient is computed as N * D^(-1) Fr numerator; // eventually would represent N*R^k @@ -63,7 +63,7 @@ static void Fr_lagrange_coeff_at_zero(Fr* res, const int i, const uint8_t indice // Computes the Langrange interpolation at zero P(0) = LI(0) with regards to the indices [indices(0)..indices(t)] // and their G1 images [shares(0)..shares(t)], and stores the resulting G1 point in `dest`. // `len` is equal to `t+1` where `t` is the polynomial degree. -static void E1_lagrange_interpolate_at_zero(E1* out, const E1 shares[], const uint8_t indices[], const int len) { +static void E1_lagrange_interpolate_at_zero(E1* out, const E1 shares[], const byte indices[], const int len) { // Purpose is to compute Q(0) where Q(x) = A_0 + A_1*x + ... + A_t*x^t in G1 // where A_i = g1 ^ a_i @@ -83,7 +83,7 @@ static void E1_lagrange_interpolate_at_zero(E1* out, const E1 shares[], const ui // Computes the Langrange interpolation at zero LI(0) with regards to the indices [indices(0)..indices(t)] // and writes their E1 concatenated serializations [shares(1)..shares(t+1)] in `dest`. // `len` is equal to `t+1` where `t` is the polynomial degree. 
-int E1_lagrange_interpolate_at_zero_write(byte* dest, const byte* shares, const uint8_t indices[], const int len) { +int E1_lagrange_interpolate_at_zero_write(byte* dest, const byte* shares, const byte indices[], const int len) { int read_ret; E1* E1_shares = malloc(sizeof(E1) * len); for (int i=0; i < len; i++) { diff --git a/crypto/bls_thresholdsign_include.h b/crypto/bls_thresholdsign_include.h index 1275b10bab4..3937f8ce965 100644 --- a/crypto/bls_thresholdsign_include.h +++ b/crypto/bls_thresholdsign_include.h @@ -3,7 +3,7 @@ #include "bls_include.h" -int E1_lagrange_interpolate_at_zero_write(byte*, const byte* , const uint8_t[], const int); +int E1_lagrange_interpolate_at_zero_write(byte*, const byte* , const byte[], const int); extern void Fr_polynomial_image(Fr* out, E2* y, const Fr* a, const int a_size, const byte x); #endif diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 89966463c61..20c2fcad5df 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -1,63 +1,19 @@ #ifndef __BLST_INCLUDE_H__ #define __BLST_INCLUDE_H__ -// extra tools to use BLST low level that are needed by the Flow crypto library -// eventually this file would replace blst.h - +// BLST src headers #include "point.h" #include "fields.h" #include "consts.h" -#include "errors.h" - -// types used by the Flow crypto library that are imported from BLST -// these type definitions are used as an abstraction from BLST internal types - -// Parts of this file have been copied from blst.h in the BLST repo -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ - -#ifdef __SIZE_TYPE__ -typedef __SIZE_TYPE__ size_t; -#else -#include -#endif - -#if defined(__UINT8_TYPE__) && defined(__UINT32_TYPE__) \ - && defined(__UINT64_TYPE__) -typedef __UINT8_TYPE__ uint8_t; -typedef __UINT32_TYPE__ uint32_t; -typedef __UINT64_TYPE__ uint64_t; -#else -#include -#endif - -typedef uint8_t byte; - -#ifdef __cplusplus -extern "C" { -#elif defined(__BLST_CGO__) -typedef _Bool bool; /* it's assumed that cgo calls modern enough compiler */ -#elif defined(__STDC_VERSION__) && __STDC_VERSION__>=199901 -# define bool _Bool -#else -# define bool int -#endif - -#ifdef SWIG -# define DEFNULL =NULL -#elif defined __cplusplus -# define DEFNULL =0 -#else -# define DEFNULL -#endif +#include "errors.h" // TODO: add sanity checks that BLST_PK_IS_INFINITY is indeed the last // enum value (eventually submit a fix to BLST) #define BLST_BAD_SCALAR ((BLST_PK_IS_INFINITY)+1) +// types used by the Flow crypto library that are imported from BLST +// these type definitions are used as an abstraction from BLST internal types + // field elements F_r // where `r` is the order of G1/G2. // F_r elements are represented as big numbers reduced modulo `r`. Big numbers diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 2b34572089c..9966fbcfc37 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -78,7 +78,7 @@ BLST_ERROR E2_vector_read_bytes(E2* A, const byte* src, const int len){ // checks the discrete log relationship in G2. // - returns 1 if g2^x = y, where g2 is the generator of G2 // - returns 0 otherwise. 
-bool_t G2_check_log(const Fr* x, const E2* y) { +bool G2_check_log(const Fr* x, const E2* y) { E2 tmp; G2_mult_gen(&tmp, x); return E2_is_equal(&tmp, y); diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index dd81bbcd79c..98420cc87cf 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -473,9 +473,9 @@ func readVerifVector(A []pointE2, src []byte) error { func (s *feldmanVSSstate) verifyShare() bool { // check y[current] == x.G2 - return C.G2_check_log( + return bool(C.G2_check_log( (*C.Fr)(&s.x), - (*C.E2)(&s.y[s.myIndex])) != 0 + (*C.E2)(&s.y[s.myIndex]))) } // computePublicKeys extracts the participants public keys from the verification vector diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index 620c962faaa..a7de2fe93d9 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -511,9 +511,10 @@ func (s *feldmanVSSQualState) buildAndBroadcastComplaintAnswer(complainee index) // - true if the complaint answer is not correct func (s *feldmanVSSQualState) checkComplaint(complainer index, c *complaint) bool { // check y[complainer] == share.G2 - return C.G2_check_log( + isLog := C.G2_check_log( (*C.Fr)(&c.answer), - (*C.E2)(&s.y[complainer])) == 0 + (*C.E2)(&s.y[complainer])) + return !bool(isLog) } // data = |complainee| diff --git a/crypto/dkg_include.h b/crypto/dkg_include.h index ca6619eb10f..8d3bdc7e1d7 100644 --- a/crypto/dkg_include.h +++ b/crypto/dkg_include.h @@ -8,6 +8,6 @@ void Fr_polynomial_image(Fr* out, E2* y, const Fr* a, const int deg, cons void E2_polynomial_images(E2* y, const int len_y, const E2* A, const int deg); void G2_vector_write_bytes(byte* out, const E2* A, const int len); BLST_ERROR E2_vector_read_bytes(E2* A, const byte* src, const int len); -bool_t G2_check_log(const Fr* x, const E2* y); +bool G2_check_log(const Fr* x, const E2* y); #endif From fb4ac123b967fdbe4d3f1b676d6f299367a81ecf Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Sat, 27 May 2023 22:11:45 -0600 Subject: [PATCH 118/200] add sanity check scalar mult in G1 and G2 --- crypto/bls12381_utils_test.go | 36 +++++++++++++++++++++++++++++++++++ crypto/ecdsa_test.go | 2 +- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 69d7e687f9b..17a1526414a 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -9,6 +9,42 @@ import ( "github.com/stretchr/testify/require" ) +// Sanity check of G1 and G2 scalar multiplication +func TestScalarMultBLS12381(t *testing.T) { + expoBytes, err := hex.DecodeString("444465cb6cc2dba9474e6beeb6a9013fbf1260d073429fb14a31e63e89129390") + require.NoError(t, err) + + var expo scalar + isZero := mapToFr(&expo, expoBytes) + require.False(t, isZero) + + // G1 generator multiplication + // Note that generator and random point multiplications + // are implemented with the same algorithm + t.Run("G1", func(t *testing.T) { + var p pointE1 + generatorScalarMultG1(&p, &expo) + expected, err := hex.DecodeString("96484ca50719f5d2533047960878b6bae8289646c0f00a942a1e6992be9981a9e0c7a51e9918f9b19d178cf04a8018a4") + require.NoError(t, err) + pBytes := make([]byte, SignatureLenBLSBLS12381) + writePointE1(pBytes, &p) + assert.Equal(t, pBytes, expected) + }) + + // G2 generator multiplication + // Note that generator and random point multiplications + // are implemented with the same algorithm + t.Run("G2", func(t *testing.T) { + var p pointE2 + generatorScalarMultG2(&p, &expo) + expected, err := 
hex.DecodeString("b35f5043f166848805b98da62dcb9c5d2f25e497bd0d9c461d4a00d19e4e67cc1e813de3c99479d5a2c62fb754fd7df40c4fd60c46834c8ae665343a3ff7dc3cc929de34ad62b7b55974f4e3fd20990d3e564b96e4d33de87716052d58cf823e") + require.NoError(t, err) + pBytes := make([]byte, PubKeyLenBLSBLS12381) + writePointE2(pBytes, &p) + assert.Equal(t, pBytes, expected) + }) +} + // G1 and G2 scalar multiplication func BenchmarkScalarMult(b *testing.B) { seed := make([]byte, securityBits/8) diff --git a/crypto/ecdsa_test.go b/crypto/ecdsa_test.go index d5d38f8e947..6a69453816d 100644 --- a/crypto/ecdsa_test.go +++ b/crypto/ecdsa_test.go @@ -157,7 +157,7 @@ func TestECDSAUtils(t *testing.T) { // TestScalarMult is a unit test of the scalar multiplication // This is only a sanity check meant to make sure the curve implemented // is checked against an independant test vector -func TestScalarMult(t *testing.T) { +func TestScalarMultP256_secp256k1(t *testing.T) { secp256k1 := secp256k1Instance.curve p256 := p256Instance.curve genericMultTests := []struct { From 0232a953b2040dda260de7aa4c761f8404efb1e5 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Sun, 28 May 2023 00:27:37 -0600 Subject: [PATCH 119/200] use not enough shares error in BLSReconstructThresholdSignature --- crypto/bls_thresholdsign.go | 14 +++++++++----- crypto/bls_thresholdsign_test.go | 9 +++++++-- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 3cef4d4e605..a7eaad5a2a4 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -437,10 +437,14 @@ func (s *blsThresholdSignatureInspector) reconstructThresholdSignature() (Signat // // size is the number of participants, it must be in the range [ThresholdSignMinSize..ThresholdSignMaxSize]. // threshold is the threshold value, it must be in the range [MinimumThreshold..size-1]. -// The function does not check the validity of the shares, and does not check -// the validity of the resulting signature. +// The function does not accept any input public key. Therefore, it does not check the validity of the +// shares against individual public keys, and does not check the validity of the resulting signature +// against the group public key. // BLSReconstructThresholdSignature returns: -// - (nil, error) if the inputs are not in the correct range, if the threshold is not reached +// - (nil, invalidInputsError) if : +// -- numbers of shares does not match the number of signers +// -- the inputs are not in the correct range. +// - (nil, notEnoughSharesError) if the threshold is not reached. // - (nil, duplicatedSignerError) if input signers are not distinct. // - (nil, invalidSignatureError) if at least one of the first (threshold+1) signatures. // does not serialize to a valid E1 point. 
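For illustration, a minimal caller-side sketch of the error handling described in the doc comment above. It relies only on the API shown in this patch (`BLSReconstructThresholdSignature` and the `IsNotEnoughSharesError` / `IsInvalidSignatureError` helpers used in the test changes below); the package name, function name, and parameter values are placeholders, not part of the patch.

```go
package example

import (
	"fmt"

	"github.com/onflow/flow-go/crypto"
)

// reconstructGroupSig combines at least threshold+1 signature shares into a group signature.
// n is the number of participants, threshold the threshold value, shares the collected
// signature shares, and signers their distinct indices (one index per share).
func reconstructGroupSig(n, threshold int, shares []crypto.Signature, signers []int) (crypto.Signature, error) {
	sig, err := crypto.BLSReconstructThresholdSignature(n, threshold, shares, signers)
	switch {
	case err == nil:
		// The reconstruction does not take public keys as input, so the caller should
		// still verify the result against the group public key.
		return sig, nil
	case crypto.IsNotEnoughSharesError(err):
		return nil, fmt.Errorf("need at least %d shares: %w", threshold+1, err)
	case crypto.IsInvalidSignatureError(err):
		return nil, fmt.Errorf("a share does not serialize to a valid E1 point: %w", err)
	default:
		// remaining cases: inputs out of range, mismatched slice lengths, or duplicated signers
		return nil, err
	}
}
```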
@@ -470,8 +474,8 @@ func BLSReconstructThresholdSignature(size int, threshold int, } if len(shares) < threshold+1 { - return nil, invalidInputsErrorf( - "the number of signatures does not reach the threshold") + return nil, notEnoughSharesErrorf( + "the number of signatures %d is less than the minimum %d", len(shares), threshold+1) } // map to check signers are distinct diff --git a/crypto/bls_thresholdsign_test.go b/crypto/bls_thresholdsign_test.go index 3e55f3d1806..20d578db264 100644 --- a/crypto/bls_thresholdsign_test.go +++ b/crypto/bls_thresholdsign_test.go @@ -594,9 +594,14 @@ func testCentralizedStatelessAPI(t *testing.T) { signers[randomDuplicate] = tmp } + // check with not enough signatures + thresholdSignature, err = BLSReconstructThresholdSignature(n, threshold, signShares[:threshold], signers[:threshold]) + assert.Error(t, err) + assert.True(t, IsNotEnoughSharesError(err)) + assert.Nil(t, thresholdSignature) + // check with an invalid signature (invalid serialization) - invalidSig := make([]byte, signatureLengthBLSBLS12381) - signShares[0] = invalidSig + signShares[0] = BLSInvalidSignature() thresholdSignature, err = BLSReconstructThresholdSignature(n, threshold, signShares, signers[:threshold+1]) assert.Error(t, err) assert.True(t, IsInvalidSignatureError(err)) From 5fa28df293e2490da9b77d9df86edb492ced274a Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Sun, 28 May 2023 01:47:16 -0600 Subject: [PATCH 120/200] refactor BLS constants to use internal BLS12_381 length constants --- crypto/bls.go | 35 ++++++++++++++--------------------- crypto/bls12381_utils.c | 12 ++++++++++++ crypto/bls12381_utils.go | 14 +++++++++----- crypto/bls12381_utils.h | 3 +++ crypto/bls12381_utils_test.go | 10 +++++----- crypto/bls_crossBLST_test.go | 2 +- crypto/dkg_feldmanvss.go | 8 ++++---- crypto/dkg_feldmanvssq.go | 4 ++-- 8 files changed, 50 insertions(+), 38 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 3206a29cdf9..65c7ce4d390 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -39,24 +39,16 @@ import ( "github.com/onflow/flow-go/crypto/hash" ) -const ( - // BLS12-381 - // p size in bytes, where G1 is defined over the field Zp - fieldSize = 48 - // - // 1 for compressed, 0 for uncompressed - values should not be changed - uncompressed = 0 //nolint - compressed = 1 - // Points compression when serialized - serializationG1 = compressed - serializationG2 = compressed - // - // SignatureLenBLSBLS12381 is the size of G1 elements - SignatureLenBLSBLS12381 = fieldSize * (2 - serializationG1) // the length is divided by 2 if compression is on - PrKeyLenBLSBLS12381 = 32 // equal to frBytesLen - // PubKeyLenBLSBLS12381 is the size of G2 elements - PubKeyLenBLSBLS12381 = 2 * fieldSize * (2 - serializationG2) // the length is divided by 2 if compression is on +var ( + // SignatureLenBLSBLS12381 is the size of a `G_1` element. + SignatureLenBLSBLS12381 = g1BytesLen + // PubKeyLenBLSBLS12381 is the size of a `G_2` element. + PubKeyLenBLSBLS12381 = g2BytesLen + // PrKeyLenBLSBLS12381 is the size of a `F_r` element, where `r` is the order of `G_1` and `G_2`. + PrKeyLenBLSBLS12381 = frBytesLen +) +const ( // Hash to curve params // hash to curve suite ID of the form : CurveID_ || HashID_ || MapID_ || encodingVariant_ h2cSuiteID = "BLS12381G1_XOF:KMAC128_SSWU_RO_" @@ -70,8 +62,7 @@ const ( ) // expandMsgOutput is the output length of the expand_message step as required by the -// hash_to_curve algorithm (and the map to G1 step) -// +// hash_to_curve algorithm (and the map to G1 step). 
// (Cgo does not export C macros) var expandMsgOutput = int(C.get_mapToG1_input_len()) @@ -360,7 +351,8 @@ func (a *blsBLS12381Algo) decodePublicKey(publicKeyBytes []byte) (PublicKey, err // decodePublicKeyCompressed decodes a slice of bytes into a public key. // since we use the compressed representation by default, this checks the default and delegates to decodePublicKeyCompressed func (a *blsBLS12381Algo) decodePublicKeyCompressed(publicKeyBytes []byte) (PublicKey, error) { - if serializationG2 != compressed { + // in compression mode, g2BytesLen is equal to 2 * Fp_bytes + if g2BytesLen != 2*fpBytesLen { panic("library is not configured to use compressed public key serialization") } return a.decodePublicKey(publicKeyBytes) @@ -490,7 +482,8 @@ func (pk *pubKeyBLSBLS12381) Size() int { // The encoding is a compressed encoding of the point // [zcash] https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format- func (a *pubKeyBLSBLS12381) EncodeCompressed() []byte { - if serializationG2 != compressed { + // in compression mode, g2BytesLen is equal to 2 * Fp_bytes + if g2BytesLen != 2*fpBytesLen { panic("library is not configured to use compressed public key serialization") } dest := make([]byte, pubKeyLengthBLSBLS12381) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 07224cd4242..00d32ccdfed 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -24,6 +24,18 @@ int get_Fr_BYTES() { return Fr_BYTES; } +int get_Fp_BYTES() { + return Fp_BYTES; +} + +int get_G1_SER_BYTES() { + return G1_SER_BYTES; +} + +int get_G2_SER_BYTES() { + return G2_SER_BYTES; +} + int get_mapToG1_input_len() { return MAP_TO_G1_INPUT_LEN; } diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 94083cf9abe..f0a6c77a12d 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -44,8 +44,12 @@ type scalar C.Fr // TODO: For now scalars are represented as field elements Fr since all scalars // are less than r - check if distinguishing two types in necessary -// BLS12-381 related lengths +// BLS12-381 related lengths, exported as functions +// because cgo does not recognize C macros. 
var frBytesLen = int(C.get_Fr_BYTES()) +var g1BytesLen = int(C.get_G1_SER_BYTES()) +var g2BytesLen = int(C.get_G2_SER_BYTES()) +var fpBytesLen = int(C.get_Fp_BYTES()) // get some constants from the C layer // (Cgo does not export C macros) @@ -151,14 +155,14 @@ func writeScalar(dest []byte, x *scalar) { } // writePointE2 writes a G2 point in a slice of bytes -// The slice should be of size PubKeyLenBLSBLS12381 and the serialization +// The slice should be of size g2BytesLen and the serialization // follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves func writePointE2(dest []byte, a *pointE2) { C.E2_write_bytes((*C.uchar)(&dest[0]), (*C.E2)(a)) } // writePointE1 writes a G1 point in a slice of bytes -// The slice should be of size SignatureLenBLSBLS12381 and the serialization +// The slice should be of size g1BytesLen and the serialization // follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves func writePointE1(dest []byte, a *pointE1) { C.E1_write_bytes((*C.uchar)(&dest[0]), (*C.E1)(a)) @@ -187,7 +191,7 @@ func readScalarFrStar(a *scalar, src []byte) error { } // readPointE2 reads a E2 point from a slice of bytes -// The slice is expected to be of size PubKeyLenBLSBLS12381 and the deserialization +// The slice is expected to be of size g2BytesLen and the deserialization // follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves. // No G2 membership check is performed. func readPointE2(a *pointE2, src []byte) error { @@ -208,7 +212,7 @@ func readPointE2(a *pointE2, src []byte) error { } // readPointE1 reads a E1 point from a slice of bytes -// The slice should be of size SignatureLenBLSBLS12381 and the deserialization +// The slice should be of size g1BytesLen and the deserialization // follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves. // No G1 membership check is performed. 
func readPointE1(a *pointE1, src []byte) error { diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 3e8ca1b06ea..f3800e6ebef 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -46,6 +46,9 @@ typedef _Bool bool; // assuming cgo is using a modern enough compiler int get_valid(); int get_invalid(); int get_Fr_BYTES(); +int get_Fp_BYTES(); +int get_G1_SER_BYTES(); +int get_G2_SER_BYTES(); int get_mapToG1_input_len(); // BLS based SPoCK diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 17a1526414a..2c9d76bbbe5 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -26,7 +26,7 @@ func TestScalarMultBLS12381(t *testing.T) { generatorScalarMultG1(&p, &expo) expected, err := hex.DecodeString("96484ca50719f5d2533047960878b6bae8289646c0f00a942a1e6992be9981a9e0c7a51e9918f9b19d178cf04a8018a4") require.NoError(t, err) - pBytes := make([]byte, SignatureLenBLSBLS12381) + pBytes := make([]byte, g1BytesLen) writePointE1(pBytes, &p) assert.Equal(t, pBytes, expected) }) @@ -39,7 +39,7 @@ func TestScalarMultBLS12381(t *testing.T) { generatorScalarMultG2(&p, &expo) expected, err := hex.DecodeString("b35f5043f166848805b98da62dcb9c5d2f25e497bd0d9c461d4a00d19e4e67cc1e813de3c99479d5a2c62fb754fd7df40c4fd60c46834c8ae665343a3ff7dc3cc929de34ad62b7b55974f4e3fd20990d3e564b96e4d33de87716052d58cf823e") require.NoError(t, err) - pBytes := make([]byte, PubKeyLenBLSBLS12381) + pBytes := make([]byte, g2BytesLen) writePointE2(pBytes, &p) assert.Equal(t, pBytes, expected) }) @@ -130,7 +130,7 @@ func BenchmarkMapToG1(b *testing.B) { // test subgroup membership check in G1 and G2 func TestSubgroupCheck(t *testing.T) { prg := getPRG(t) - seed := make([]byte, PubKeyLenBLSBLS12381) + seed := make([]byte, g2BytesLen) _, err := prg.Read(seed) require.NoError(t, err) @@ -165,7 +165,7 @@ func TestSubgroupCheck(t *testing.T) { // subgroup membership check bench func BenchmarkSubgroupCheck(b *testing.B) { - seed := make([]byte, PubKeyLenBLSBLS12381) + seed := make([]byte, g2BytesLen) _, err := mrand.Read(seed) require.NoError(b, err) @@ -195,7 +195,7 @@ func BenchmarkSubgroupCheck(b *testing.B) { func TestReadWriteG1(t *testing.T) { prg := getPRG(t) seed := make([]byte, frBytesLen) - bytes := make([]byte, SignatureLenBLSBLS12381) + bytes := make([]byte, g1BytesLen) // generate a random G1 point, encode it, decode it, // and compare it the original point iterations := 50 diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index 6d3f1765e25..7629289ba9e 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -134,7 +134,7 @@ func testEncodeDecodePublicKeyCrossBLST(t *rapid.T) { // testEncodeDecodeG1CrossBLST tests encoding and decoding of G1 points are consistent with BLST. // This test assumes signature serialization is identical to BLST. 
func testEncodeDecodeG1CrossBLST(t *rapid.T) { - randomSlice := rapid.SliceOfN(rapid.Byte(), SignatureLenBLSBLS12381, SignatureLenBLSBLS12381) + randomSlice := rapid.SliceOfN(rapid.Byte(), g1BytesLen, g1BytesLen) validSignatureFlow := rapid.Custom(validSignatureBytesFlow) validSignatureBLST := rapid.Custom(validSignatureBytesBLST) // sigBytes are bytes of either a valid serialization of a E1/G1 point, or random bytes diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index 98420cc87cf..ac76469f962 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -151,11 +151,11 @@ func (s *feldmanVSSstate) End() (PrivateKey, PublicKey, []PublicKey, error) { return x, Y, y, nil } -const ( - shareSize = PrKeyLenBLSBLS12381 +var ( + shareSize = frBytesLen // the actual verifVectorSize depends on the state and is: - // PubKeyLenBLSBLS12381*(t+1) - verifVectorSize = PubKeyLenBLSBLS12381 + // g2BytesLen*(t+1) + verifVectorSize = g2BytesLen ) // HandleBroadcastMsg processes a new broadcasted message received by the current participant. diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index a7de2fe93d9..b8056b990dc 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -201,9 +201,9 @@ func (s *feldmanVSSQualState) End() (PrivateKey, PublicKey, []PublicKey, error) return x, Y, y, nil } -const ( +var ( complaintSize = 1 - complaintAnswerSize = 1 + PrKeyLenBLSBLS12381 + complaintAnswerSize = 1 + frBytesLen ) // HandleBroadcastMsg processes a new broadcasted message received by the current participant. From d2c7cbf2eb3172f6871cefd1eb7a579d61ae2bc9 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 29 May 2023 13:52:29 -0600 Subject: [PATCH 121/200] more consolidation of length constants --- crypto/bls.go | 17 ++++++----------- crypto/bls12381_utils.go | 4 ++-- crypto/bls_core.c | 21 ++++----------------- crypto/bls_crossBLST_test.go | 4 ++-- crypto/bls_include.h | 11 ----------- crypto/bls_multisig.go | 12 ++++++------ crypto/bls_test.go | 4 ++-- crypto/bls_thresholdsign.go | 8 ++++---- crypto/spock.go | 2 +- 9 files changed, 27 insertions(+), 56 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 65c7ce4d390..f515a9445dc 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -184,7 +184,7 @@ func (pk *pubKeyBLSBLS12381) Verify(s Signature, data []byte, kmac hash.Hasher) return false, err } - if len(s) != signatureLengthBLSBLS12381 { + if len(s) != SignatureLenBLSBLS12381 { return false, nil } @@ -214,7 +214,7 @@ func (pk *pubKeyBLSBLS12381) Verify(s Signature, data []byte, kmac hash.Hasher) // 0xC0 is the header of the point at infinity serialization (either in G1 or G2) const infinityPointHeader = byte(0xC0) -var identityBLSSignature = append([]byte{infinityPointHeader}, make([]byte, signatureLengthBLSBLS12381-1)...) +var identityBLSSignature = append([]byte{infinityPointHeader}, make([]byte, SignatureLenBLSBLS12381-1)...) // IsBLSSignatureIdentity checks whether the input signature is // the identity signature (point at infinity in G1). @@ -327,9 +327,9 @@ func (a *blsBLS12381Algo) decodePrivateKey(privateKeyBytes []byte) (PrivateKey, // a faster check during signature verifications. Any verification against an identity // public key outputs `false`. 
func (a *blsBLS12381Algo) decodePublicKey(publicKeyBytes []byte) (PublicKey, error) { - if len(publicKeyBytes) != pubKeyLengthBLSBLS12381 { + if len(publicKeyBytes) != PubKeyLenBLSBLS12381 { return nil, invalidInputsErrorf("input length must be %d, got %d", - pubKeyLengthBLSBLS12381, len(publicKeyBytes)) + PubKeyLenBLSBLS12381, len(publicKeyBytes)) } var pk pubKeyBLSBLS12381 err := readPointE2(&pk.point, publicKeyBytes) @@ -415,7 +415,7 @@ func (sk *prKeyBLSBLS12381) PublicKey() PublicKey { // Encode returns a byte encoding of the private key. // The encoding is a raw encoding in big endian padded to the group order func (a *prKeyBLSBLS12381) Encode() []byte { - dest := make([]byte, prKeyLengthBLSBLS12381) + dest := make([]byte, frBytesLen) writeScalar(dest, &a.scalar) return dest } @@ -486,7 +486,7 @@ func (a *pubKeyBLSBLS12381) EncodeCompressed() []byte { if g2BytesLen != 2*fpBytesLen { panic("library is not configured to use compressed public key serialization") } - dest := make([]byte, pubKeyLengthBLSBLS12381) + dest := make([]byte, g2BytesLen) writePointE2(dest, &a.point) return dest } @@ -511,11 +511,6 @@ func (pk *pubKeyBLSBLS12381) String() string { return pk.point.String() } -// Get Macro definitions from the C layer as Cgo does not export macros -var signatureLengthBLSBLS12381 = int(C.get_signature_len()) -var pubKeyLengthBLSBLS12381 = int(C.get_pk_len()) -var prKeyLengthBLSBLS12381 = int(C.get_sk_len()) - // This is only a TEST function. // signWithXMDSHA256 signs a message using XMD_SHA256 as a hash to field. // diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index f0a6c77a12d..0cc7e75a509 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -70,7 +70,7 @@ func (a *scalar) String() string { } func (p *pointE2) String() string { - encoding := make([]byte, pubKeyLengthBLSBLS12381) + encoding := make([]byte, g2BytesLen) writePointE2(encoding, p) return fmt.Sprintf("%#x", encoding) } @@ -307,7 +307,7 @@ func hashToG1Bytes(data, dst []byte) []byte { } // serialize the point - pointBytes := make([]byte, signatureLengthBLSBLS12381) + pointBytes := make([]byte, g1BytesLen) writePointE1(pointBytes, &point) return pointBytes } diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 6711320cf51..e1578a150fe 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -4,19 +4,6 @@ // The functions are tested for ALLOC=AUTO (not for ALLOC=DYNAMIC) -// functions to export macros to the Go layer (because cgo does not import macros) -int get_signature_len() { - return SIGNATURE_LEN; -} - -int get_pk_len() { - return PK_LEN; -} - -int get_sk_len() { - return SK_LEN; -} - // Computes a BLS signature from a G1 point and writes it in `out`. // `out` must be allocated properly with `G1_SER_BYTES` bytes. 
static void bls_sign_E1(byte* out, const Fr* sk, const E1* h) { @@ -93,7 +80,7 @@ int bls_verifyPerDistinctMessage(const byte* sig, if (!elemsG2) goto outG2; // elemsG1[0] = sig - if (E1_read_bytes(&elemsG1[0], sig, SIGNATURE_LEN) != BLST_SUCCESS) { + if (E1_read_bytes(&elemsG1[0], sig, G1_SER_BYTES) != BLST_SUCCESS) { ret = INVALID; goto out; } @@ -167,7 +154,7 @@ int bls_verifyPerDistinctKey(const byte* sig, if (!elemsG2) goto outG2; // elemsG1[0] = s - if (E1_read_bytes(&elemsG1[0], sig, SIGNATURE_LEN) != BLST_SUCCESS) { + if (E1_read_bytes(&elemsG1[0], sig, G1_SER_BYTES) != BLST_SUCCESS) { ret = INVALID; goto out; } @@ -243,7 +230,7 @@ int bls_verifyPerDistinctKey(const byte* sig, int bls_verify(const E2* pk, const byte* sig, const byte* hash, const int hash_len) { E1 s, h; // deserialize the signature into a curve point - if (E1_read_bytes(&s, sig, SIGNATURE_LEN) != BLST_SUCCESS) { + if (E1_read_bytes(&s, sig, G1_SER_BYTES) != BLST_SUCCESS) { return INVALID; } @@ -393,7 +380,7 @@ void bls_batch_verify(const int sigs_len, byte* results, const E2* pks_input, // the tree aggregations remain valid. // - valid points are multiplied by a random scalar (same for public keys at same index) // to make sure a signature at index (i) is verified against the public key at the same index. - int read_ret = E1_read_bytes(&sigs[i], &sigs_bytes[SIGNATURE_LEN*i], SIGNATURE_LEN); + int read_ret = E1_read_bytes(&sigs[i], &sigs_bytes[G1_SER_BYTES*i], G1_SER_BYTES); if (read_ret != BLST_SUCCESS || !E1_in_G1(&sigs[i])) { // set signature and key to infinity (no effect on the aggregation tree) // and set result to invalid (result won't be overwritten) diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index 7629289ba9e..3b3939eaf6c 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -80,7 +80,7 @@ func validSignatureBytesBLST(t *rapid.T) []byte { // testEncodeDecodePrivateKeyCrossBLST tests encoding and decoding of private keys are consistent with BLST. // This test assumes private key serialization is identical to the one in BLST. 
func testEncodeDecodePrivateKeyCrossBLST(t *rapid.T) { - randomSlice := rapid.SliceOfN(rapid.Byte(), prKeyLengthBLSBLS12381, prKeyLengthBLSBLS12381) + randomSlice := rapid.SliceOfN(rapid.Byte(), PrKeyLenBLSBLS12381, PrKeyLenBLSBLS12381) validSliceFlow := rapid.Custom(validPrivateKeyBytesFlow) validSliceBLST := rapid.Custom(validPrivateKeyBytesBLST) // skBytes are bytes of either a valid or a random private key @@ -154,7 +154,7 @@ func testEncodeDecodeG1CrossBLST(t *rapid.T) { // check both serializations of G1 points are equal if flowPass && blstPass { - sigFlowOutBytes := make([]byte, signatureLengthBLSBLS12381) + sigFlowOutBytes := make([]byte, g1BytesLen) writePointE1(sigFlowOutBytes, &pointFlow) sigBLSTOutBytes := pointBLST.Compress() assert.Equal(t, sigFlowOutBytes, sigBLSTOutBytes) diff --git a/crypto/bls_include.h b/crypto/bls_include.h index 4b8e1075501..1ca61b376c4 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -5,19 +5,8 @@ #include "bls12381_utils.h" -// Signature, Public key and Private key lengths -#define FULL_SIGNATURE_LEN G1_BYTES -#define FULL_PK_LEN G2_BYTES -#define SIGNATURE_LEN (FULL_SIGNATURE_LEN/(G1_SERIALIZATION+1)) -#define PK_LEN (FULL_PK_LEN/(G2_SERIALIZATION+1)) -#define SK_BITS (Fr_BITS) -#define SK_LEN BITS_TO_BYTES(SK_BITS) // bls core (functions in bls_core.c) -int get_signature_len(); -int get_pk_len(); -int get_sk_len(); - int bls_sign(byte*, const Fr*, const byte*, const int); int bls_verify(const E2*, const byte*, const byte*, const int); int bls_verifyPerDistinctMessage(const byte*, const int, const byte*, const uint32_t*, diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index 5714b7e2a34..6c99ae461e2 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -102,14 +102,14 @@ func AggregateBLSSignatures(sigs []Signature) (Signature, error) { } // flatten the shares (required by the C layer) - flatSigs := make([]byte, 0, signatureLengthBLSBLS12381*len(sigs)) + flatSigs := make([]byte, 0, SignatureLenBLSBLS12381*len(sigs)) for i, sig := range sigs { - if len(sig) != signatureLengthBLSBLS12381 { + if len(sig) != SignatureLenBLSBLS12381 { return nil, fmt.Errorf("signature at index %d has an invalid length: %w", i, invalidSignatureError) } flatSigs = append(flatSigs, sig...) } - aggregatedSig := make([]byte, signatureLengthBLSBLS12381) + aggregatedSig := make([]byte, SignatureLenBLSBLS12381) // add the points in the C layer result := C.E1_sum_vector_byte( @@ -325,7 +325,7 @@ func VerifyBLSSignatureManyMessages( ) (bool, error) { // check signature length - if len(s) != signatureLengthBLSBLS12381 { + if len(s) != SignatureLenBLSBLS12381 { return false, nil } // check the list lengths @@ -494,7 +494,7 @@ func BatchVerifyBLSSignaturesOneMessage( } // flatten the shares (required by the C layer) - flatSigs := make([]byte, 0, signatureLengthBLSBLS12381*len(sigs)) + flatSigs := make([]byte, 0, SignatureLenBLSBLS12381*len(sigs)) pkPoints := make([]pointE2, 0, len(pks)) getIdentityPoint := func() pointE2 { @@ -508,7 +508,7 @@ func BatchVerifyBLSSignaturesOneMessage( return falseSlice, fmt.Errorf("key at index %d is invalid: %w", i, notBLSKeyError) } - if len(sigs[i]) != signatureLengthBLSBLS12381 || pkBLS.isIdentity { + if len(sigs[i]) != SignatureLenBLSBLS12381 || pkBLS.isIdentity { // case of invalid signature: set the signature and public key at index `i` // to identities so that there is no effect on the aggregation tree computation. // However, the boolean return for index `i` is set to `false` and won't be overwritten. 
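Aside (illustrative only, not part of the patch): under the compressed Zcash-style serialization used here, the consolidated length constants correspond to the usual BLS12-381 sizes. A minimal Go sketch with hypothetical names:

const (
    frLen = 32        // F_r element: scalar / private key
    fpLen = 48        // F_p element
    g1Len = fpLen     // compressed G1 point: signature
    g2Len = 2 * fpLen // compressed G2 point: public key
)
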
diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 801af0a24a5..d8561ccc5f6 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -411,7 +411,7 @@ func TestBLSAggregateSignatures(t *testing.T) { assert.False(t, result) // test with a signature of a wrong length - shortSig := sigs[0][:signatureLengthBLSBLS12381-1] + shortSig := sigs[0][:SignatureLenBLSBLS12381-1] aggSig, err = AggregateBLSSignatures([]Signature{shortSig}) assert.Error(t, err) assert.True(t, IsInvalidSignatureError(err)) @@ -1199,7 +1199,7 @@ func TestBLSIdentity(t *testing.T) { sk := randomSK(t, rand) sig, err := sk.Sign(msg, hasher) require.NoError(t, err) - oppositeSig := make([]byte, signatureLengthBLSBLS12381) + oppositeSig := make([]byte, SignatureLenBLSBLS12381) copy(oppositeSig, sig) negatePoint(oppositeSig) aggSig, err := AggregateBLSSignatures([]Signature{sig, oppositeSig}) diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index a7eaad5a2a4..2f05ed72c42 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -399,10 +399,10 @@ func (s *blsThresholdSignatureInspector) reconstructThresholdSignature() (Signat return nil, notEnoughSharesErrorf("number of signature shares %d is not enough, %d are required", len(s.shares), s.threshold+1) } - thresholdSignature := make([]byte, signatureLengthBLSBLS12381) + thresholdSignature := make([]byte, SignatureLenBLSBLS12381) // prepare the C layer inputs - shares := make([]byte, 0, len(s.shares)*signatureLengthBLSBLS12381) + shares := make([]byte, 0, len(s.shares)*SignatureLenBLSBLS12381) signers := make([]index, 0, len(s.shares)) for index, share := range s.shares { shares = append(shares, share...) @@ -482,7 +482,7 @@ func BLSReconstructThresholdSignature(size int, threshold int, m := make(map[index]bool) // flatten the shares (required by the C layer) - flatShares := make([]byte, 0, signatureLengthBLSBLS12381*(threshold+1)) + flatShares := make([]byte, 0, SignatureLenBLSBLS12381*(threshold+1)) indexSigners := make([]index, 0, threshold+1) for i, share := range shares { flatShares = append(flatShares, share...) @@ -500,7 +500,7 @@ func BLSReconstructThresholdSignature(size int, threshold int, indexSigners = append(indexSigners, index(signers[i])+1) } - thresholdSignature := make([]byte, signatureLengthBLSBLS12381) + thresholdSignature := make([]byte, SignatureLenBLSBLS12381) // Lagrange Interpolate at point 0 if C.E1_lagrange_interpolate_at_zero_write( (*C.uchar)(&thresholdSignature[0]), diff --git a/crypto/spock.go b/crypto/spock.go index 8180b9b72bd..da269c23ac1 100644 --- a/crypto/spock.go +++ b/crypto/spock.go @@ -73,7 +73,7 @@ func SPOCKVerify(pk1 PublicKey, proof1 Signature, pk2 PublicKey, proof2 Signatur return false, notBLSKeyError } - if len(proof1) != signatureLengthBLSBLS12381 || len(proof2) != signatureLengthBLSBLS12381 { + if len(proof1) != g1BytesLen || len(proof2) != g1BytesLen { return false, nil } From a88897291207f75d455f82e8af45dd863e444198 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 29 May 2023 15:23:51 -0600 Subject: [PATCH 122/200] cgo supports macros! 
use C constant macros in go --- crypto/bls.go | 8 +++----- crypto/bls12381_utils.c | 33 +-------------------------------- crypto/bls12381_utils.go | 33 +++++++++++++++------------------ crypto/bls12381_utils.h | 9 --------- 4 files changed, 19 insertions(+), 64 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index f515a9445dc..9930db2a3b4 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -59,13 +59,11 @@ const ( // Cipher suite used for BLS PoP of the form : BLS_POP_ || h2cSuiteID || SchemeTag_ // The PoP cipher suite is guaranteed to be different than all signature ciphersuites blsPOPCipherSuite = "BLS_POP_" + h2cSuiteID + schemeTag + // expandMsgOutput is the output length of the expand_message step as required by the + // hash_to_curve algorithm (and the map to G1 step). + expandMsgOutput = int(C.MAP_TO_G1_INPUT_LEN) ) -// expandMsgOutput is the output length of the expand_message step as required by the -// hash_to_curve algorithm (and the map to G1 step). -// (Cgo does not export C macros) -var expandMsgOutput = int(C.get_mapToG1_input_len()) - // blsBLS12381Algo, embeds SignAlgo type blsBLS12381Algo struct { // the signing algo and parameters diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 00d32ccdfed..5edbb92b78e 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -9,37 +9,6 @@ // compile all blst C src along with this file #include "blst_src.c" -// The functions are tested for ALLOC=AUTO (not for ALLOC=DYNAMIC) - -// return macro values to the upper Go Layer -int get_valid() { - return VALID; -} - -int get_invalid() { - return INVALID; -} - -int get_Fr_BYTES() { - return Fr_BYTES; -} - -int get_Fp_BYTES() { - return Fp_BYTES; -} - -int get_G1_SER_BYTES() { - return G1_SER_BYTES; -} - -int get_G2_SER_BYTES() { - return G2_SER_BYTES; -} - -int get_mapToG1_input_len() { - return MAP_TO_G1_INPUT_LEN; -} - // ------------------- Fr utilities // Montgomery constant R related to the curve order r @@ -47,7 +16,7 @@ int get_mapToG1_input_len() { const Fr BLS12_381_rR = {{ \ TO_LIMB_T(0x1824b159acc5056f), TO_LIMB_T(0x998c4fefecbc4ff5), \ TO_LIMB_T(0x5884b7fa00034802), TO_LIMB_T(0x00000001fffffffe), \ - }}; +}}; // returns true if a == 0 and false otherwise bool Fr_is_zero(const Fr* a) { diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 0cc7e75a509..75b9385d3ab 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -44,24 +44,21 @@ type scalar C.Fr // TODO: For now scalars are represented as field elements Fr since all scalars // are less than r - check if distinguishing two types in necessary -// BLS12-381 related lengths, exported as functions -// because cgo does not recognize C macros. 
-var frBytesLen = int(C.get_Fr_BYTES()) -var g1BytesLen = int(C.get_G1_SER_BYTES()) -var g2BytesLen = int(C.get_G2_SER_BYTES()) -var fpBytesLen = int(C.get_Fp_BYTES()) - -// get some constants from the C layer -// (Cgo does not export C macros) -var valid = C.get_valid() -var invalid = C.get_invalid() - -// get some constants from the C layer -// var blst_errors = C.blst_get_errors() -var blst_valid = (int)(C.BLST_SUCCESS) -var blst_bad_encoding = (int)(C.BLST_BAD_ENCODING) -var blst_bad_scalar = (int)(C.BLST_BAD_SCALAR) -var blst_point_not_on_curve = (int)(C.BLST_POINT_NOT_ON_CURVE) +const ( + // BLS12-381 related lengths imported from the C layer + frBytesLen = int(C.Fr_BYTES) + g1BytesLen = int(C.G1_SER_BYTES) + g2BytesLen = int(C.G2_SER_BYTES) + fpBytesLen = int(C.Fp_BYTES) + + // more internal constants from the C layer + valid = C.VALID + invalid = C.INVALID + blst_valid = int(C.BLST_SUCCESS) + blst_bad_encoding = int(C.BLST_BAD_ENCODING) + blst_bad_scalar = int(C.BLST_BAD_SCALAR) + blst_point_not_on_curve = int(C.BLST_POINT_NOT_ON_CURVE) +) func (a *scalar) String() string { encoding := make([]byte, frBytesLen) diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index f3800e6ebef..48a7b1476de 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -42,15 +42,6 @@ typedef _Bool bool; // assuming cgo is using a modern enough compiler #define G1_SER_BYTES (G1_BYTES/(G1_SERIALIZATION+1)) #define G2_SER_BYTES (G2_BYTES/(G2_SERIALIZATION+1)) - -int get_valid(); -int get_invalid(); -int get_Fr_BYTES(); -int get_Fp_BYTES(); -int get_G1_SER_BYTES(); -int get_G2_SER_BYTES(); -int get_mapToG1_input_len(); - // BLS based SPoCK int bls_spock_verify(const E2*, const byte*, const E2*, const byte*); From 945f2b9fb3feca4bb5aeeb5efcbfeffd6411647f Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 29 May 2023 19:16:32 -0600 Subject: [PATCH 123/200] define new internal ERROR type to abstract BLST_ERROR --- crypto/bls.go | 4 +- crypto/bls12381_utils.c | 270 +++++++++++++------------------- crypto/bls12381_utils.go | 49 +++--- crypto/bls12381_utils.h | 61 ++++---- crypto/bls_core.c | 58 ++++++- crypto/bls_include.h | 6 +- crypto/bls_multisig.go | 7 - crypto/bls_thresholdsign_core.c | 4 +- crypto/blst_include.h | 5 - crypto/dkg_core.c | 6 +- crypto/dkg_feldmanvss.go | 2 +- crypto/dkg_include.h | 2 +- 12 files changed, 231 insertions(+), 243 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 9930db2a3b4..93dd487a817 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -39,16 +39,14 @@ import ( "github.com/onflow/flow-go/crypto/hash" ) -var ( +const ( // SignatureLenBLSBLS12381 is the size of a `G_1` element. SignatureLenBLSBLS12381 = g1BytesLen // PubKeyLenBLSBLS12381 is the size of a `G_2` element. PubKeyLenBLSBLS12381 = g2BytesLen // PrKeyLenBLSBLS12381 is the size of a `F_r` element, where `r` is the order of `G_1` and `G_2`. PrKeyLenBLSBLS12381 = frBytesLen -) -const ( // Hash to curve params // hash to curve suite ID of the form : CurveID_ || HashID_ || MapID_ || encodingVariant_ h2cSuiteID = "BLS12381G1_XOF:KMAC128_SSWU_RO_" diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 5edbb92b78e..35bf1ff4686 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -172,40 +172,40 @@ static void pow256_from_Fr(pow256 ret, const Fr* in) { // reads a scalar in `a` and checks it is a valid Fr element (a < r). // input is bytes-big-endian. 
// returns: -// - BLST_BAD_ENCODING if the length is invalid -// - BLST_BAD_SCALAR if the scalar isn't in Fr -// - BLST_SUCCESS if the scalar is valid -BLST_ERROR Fr_read_bytes(Fr* a, const byte *bin, int len) { +// - BAD_ENCODING if the length is invalid +// - BAD_VALUE if the scalar isn't in Fr +// - VALID if the scalar is valid +ERROR Fr_read_bytes(Fr* a, const byte *bin, int len) { if (len != Fr_BYTES) { - return BLST_BAD_ENCODING; + return BAD_ENCODING; } pow256 tmp; // compare to r using the provided tool from BLST pow256_from_be_bytes(tmp, bin); // TODO: check endianness!! if (!check_mod_256(tmp, BLS12_381_r)) { // check_mod_256 compares pow256 against a vec256! - return BLST_BAD_SCALAR; + return BAD_VALUE; } vec_zero(tmp, sizeof(tmp)); limbs_from_be_bytes((limb_t*)a, bin, Fr_BYTES); // TODO: check endianness!! - return BLST_SUCCESS; + return VALID; } // reads a scalar in `a` and checks it is a valid Fr_star element (0 < a < r). // input bytes are big endian. // returns: -// - BLST_BAD_ENCODING if the length is invalid -// - BLST_BAD_SCALAR if the scalar isn't in Fr_star -// - BLST_SUCCESS if the scalar is valid -BLST_ERROR Fr_star_read_bytes(Fr* a, const byte *bin, int len) { +// - BAD_ENCODING if the length is invalid +// - BAD_VALUE if the scalar isn't in Fr_star +// - VALID if the scalar is valid +ERROR Fr_star_read_bytes(Fr* a, const byte *bin, int len) { int ret = Fr_read_bytes(a, bin, len); - if (ret != BLST_SUCCESS) { + if (ret != VALID) { return ret; } // check if a=0 if (Fr_is_zero(a)) { - return BLST_BAD_SCALAR; + return BAD_VALUE; } - return BLST_SUCCESS; + return VALID; } // write Fr element `a` in big endian bytes. @@ -329,19 +329,19 @@ void Fp_from_montg(Fp *res, const Fp *a) { // reads a scalar in `a` and checks it is a valid Fp element (a < p). // input is bytes-big-endian. // returns: -// - BLST_BAD_ENCODING if the length is invalid -// - BLST_BAD_SCALAR if the scalar isn't in Fp -// - BLST_SUCCESS if the scalar is valid -BLST_ERROR Fp_read_bytes(Fp* a, const byte *bin, int len) { +// - BAD_ENCODING if the length is invalid +// - BAD_VALUE if the scalar isn't in Fp +// - VALID if the scalar is valid +ERROR Fp_read_bytes(Fp* a, const byte *bin, int len) { if (len != Fp_BYTES) { - return BLST_BAD_ENCODING; + return BAD_ENCODING; } limbs_from_be_bytes((limb_t*)a, bin, Fp_BYTES); // compare read scalar to p if (!Fp_check(a)) { - return BLST_BAD_ENCODING; + return BAD_VALUE; } - return BLST_SUCCESS; + return VALID; } @@ -413,22 +413,22 @@ static byte Fp2_get_sign(Fp2* y) { // input is a serialization of real(a) concatenated to serializetion of imag(a). // a[i] are both Fp elements. // returns: -// - BLST_BAD_ENCODING if the length is invalid -// - BLST_BAD_SCALAR if the scalar isn't in Fp -// - BLST_SUCCESS if the scalar is valid -static BLST_ERROR Fp2_read_bytes(Fp2* a, const byte *bin, int len) { +// - BAD_ENCODING if the length is invalid +// - BAD_VALUE if the scalar isn't in Fp +// - VALID if the scalar is valid +static ERROR Fp2_read_bytes(Fp2* a, const byte *bin, int len) { if (len != Fp2_BYTES) { - return BLST_BAD_ENCODING; + return BAD_ENCODING; } - BLST_ERROR ret = Fp_read_bytes(&real(a), bin, Fp_BYTES); - if (ret != BLST_SUCCESS) { + ERROR ret = Fp_read_bytes(&real(a), bin, Fp_BYTES); + if (ret != VALID) { return ret; } ret = Fp_read_bytes(&imag(a), bin + Fp_BYTES, Fp_BYTES); - if ( ret != BLST_SUCCESS) { + if ( ret != VALID) { return ret; } - return BLST_SUCCESS; + return VALID; } // write Fp2 element to bin and assume `bin` has `Fp2_BYTES` allocated bytes. 
@@ -494,23 +494,23 @@ bool E1_in_G1(const E1* p){ // https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) // // returns: -// - BLST_BAD_ENCODING if the length is invalid or serialization header bits are invalid -// - BLST_BAD_SCALAR if Fp coordinates couldn't deserialize -// - BLST_POINT_NOT_ON_CURVE if deserialized point isn't on E1 -// - BLST_SUCCESS if deserialization is valid +// - BAD_ENCODING if the length is invalid or serialization header bits are invalid +// - BAD_VALUE if Fp coordinates couldn't deserialize +// - POINT_NOT_ON_CURVE if deserialized point isn't on E1 +// - VALID if deserialization is valid // TODO: replace with POINTonE1_Deserialize_BE and POINTonE1_Uncompress_Z, // and update logic with G2 subgroup check? -BLST_ERROR E1_read_bytes(E1* a, const byte *bin, const int len) { +ERROR E1_read_bytes(E1* a, const byte *bin, const int len) { // check the length if (len != G1_SER_BYTES) { - return BLST_BAD_ENCODING; + return BAD_ENCODING; } // check the compression bit int compressed = bin[0] >> 7; if ((compressed == 1) != (G1_SERIALIZATION == COMPRESSED)) { - return BLST_BAD_ENCODING; + return BAD_ENCODING; } // check if the point in infinity @@ -518,29 +518,29 @@ BLST_ERROR E1_read_bytes(E1* a, const byte *bin, const int len) { if (is_infinity) { // the remaining bits need to be cleared if (bin[0] & 0x3F) { - return BLST_BAD_ENCODING; + return BAD_ENCODING; } for (int i=1; i> 5) & 1; if (y_sign && (!compressed)) { - return BLST_BAD_ENCODING; + return BAD_ENCODING; } // use a temporary buffer to mask the header bits and read a.x byte temp[Fp_BYTES]; memcpy(temp, bin, Fp_BYTES); temp[0] &= 0x1F; // clear the header bits - BLST_ERROR ret = Fp_read_bytes(&a->x, temp, sizeof(temp)); - if (ret != BLST_SUCCESS) { + ERROR ret = Fp_read_bytes(&a->x, temp, sizeof(temp)); + if (ret != VALID) { return ret; } @@ -549,14 +549,14 @@ BLST_ERROR E1_read_bytes(E1* a, const byte *bin, const int len) { if (G1_SERIALIZATION == UNCOMPRESSED) { ret = Fp_read_bytes(&a->y, bin + Fp_BYTES, sizeof(a->y)); - if (ret != BLST_SUCCESS){ + if (ret != VALID){ return ret; } // check read point is on curve if (!E1_affine_on_curve(a)) { - return BLST_POINT_NOT_ON_CURVE; + return POINT_NOT_ON_CURVE; } - return BLST_SUCCESS; + return VALID; } // compute the possible square root @@ -565,13 +565,13 @@ BLST_ERROR E1_read_bytes(E1* a, const byte *bin, const int len) { Fp_mul_montg(&a->y, &a->y, &a->x); // x^3 Fp_add(&a->y, &a->y, &B_E1); // B_E1 is already in Montg form if (!Fp_sqrt_montg(&a->y, &a->y)) // check whether x^3+b is a quadratic residue - return BLST_POINT_NOT_ON_CURVE; + return POINT_NOT_ON_CURVE; // resulting (x,y) is guaranteed to be on curve (y is already in Montg form) if (Fp_get_sign(&a->y) != y_sign) { Fp_neg(&a->y, &a->y); // flip y sign if needed } - return BLST_SUCCESS; + return VALID; } // E1_write_bytes exports a point in E1(Fp) to a buffer in a compressed or uncompressed form. @@ -649,7 +649,7 @@ int E1_sum_vector_byte(byte* dest, const byte* sigs_bytes, const int sigs_len) { // import the points from the array for (int i=0; i < n; i++) { // deserialize each point from the input array - if (E1_read_bytes(&sigs[i], &sigs_bytes[G1_SER_BYTES*i], G1_SER_BYTES) != BLST_SUCCESS) { + if (E1_read_bytes(&sigs[i], &sigs_bytes[G1_SER_BYTES*i], G1_SER_BYTES) != VALID) { error = INVALID; goto out; } @@ -706,9 +706,46 @@ int map_to_G1(E1* h, const byte* hash, const int len) { return VALID; } +// maps the bytes to a point in G1. 
+// `len` should be at least Fr_BYTES. +// this is a testing file only, should not be used in any protocol! +void unsafe_map_bytes_to_G1(E1* p, const byte* bytes, int len) { + assert(len >= Fr_BYTES); + // map to Fr + Fr log; + map_bytes_to_Fr(&log, bytes, len); + // multiplies G1 generator by a random scalar + G1_mult_gen(p, &log); +} + +// generates a point in E1\G1 and stores it in p +// this is a testing file only, should not be used in any protocol! +ERROR unsafe_map_bytes_to_G1complement(E1* p, const byte* bytes, int len) { + assert(G1_SERIALIZATION == COMPRESSED); + assert(len >= G1_SER_BYTES); + + // attempt to deserilize a compressed E1 point from input bytes + // after fixing the header 2 bits + byte copy[G1_SER_BYTES]; + memcpy(copy, bytes, sizeof(copy)); + copy[0] |= 1<<7; // set compression bit + copy[0] &= ~(1<<6); // clear infinity bit - point is not infinity + + ERROR ser = E1_read_bytes(p, copy, G1_SER_BYTES); + if (ser != VALID) { + return ser; + } + + // map the point to E2\G2 by clearing G2 order + E1_mult(p, p, (const Fr*)BLS12_381_r); + E1_to_affine(p, p); + + assert(E1_affine_on_curve(p)); // sanity check to make sure p is in E2 + return VALID; +} + // ------------------- E2 utilities -const E1* BLS12_381_g1 = (const E1*)&BLS12_381_G1; /// TODO:delete const E2* BLS12_381_g2 = (const E2*)&BLS12_381_G2; const E2* BLS12_381_minus_g2 = (const E2*)&BLS12_381_NEG_G2; @@ -716,23 +753,23 @@ const E2* BLS12_381_minus_g2 = (const E2*)&BLS12_381_NEG_G2; // The resulting point is guaranteed to be on curve E2 (no G2 check is included). // // returns: -// - BLST_BAD_ENCODING if the length is invalid or serialization header bits are invalid -// - BLST_BAD_SCALAR if Fp^2 coordinates couldn't deserialize -// - BLST_POINT_NOT_ON_CURVE if deserialized point isn't on E2 -// - BLST_SUCCESS if deserialization is valid +// - BAD_ENCODING if the length is invalid or serialization header bits are invalid +// - BAD_VALUE if Fp^2 coordinates couldn't deserialize +// - POINT_NOT_ON_CURVE if deserialized point isn't on E2 +// - VALID if deserialization is valid // TODO: replace with POINTonE2_Deserialize_BE and POINTonE2_Uncompress_Z, // and update logic with G2 subgroup check? 
-BLST_ERROR E2_read_bytes(E2* a, const byte *bin, const int len) { +ERROR E2_read_bytes(E2* a, const byte *bin, const int len) { // check the length if (len != G2_SER_BYTES) { - return BLST_BAD_ENCODING; + return BAD_ENCODING; } // check the compression bit int compressed = bin[0] >> 7; if ((compressed == 1) != (G2_SERIALIZATION == COMPRESSED)) { - return BLST_BAD_ENCODING; + return BAD_ENCODING; } // check if the point in infinity @@ -740,29 +777,29 @@ BLST_ERROR E2_read_bytes(E2* a, const byte *bin, const int len) { if (is_infinity) { // the remaining bits need to be cleared if (bin[0] & 0x3F) { - return BLST_BAD_ENCODING; + return BAD_ENCODING; } for (int i=1; i> 5) & 1; if (y_sign && (!compressed)) { - return BLST_BAD_ENCODING; + return BAD_ENCODING; } // use a temporary buffer to mask the header bits and read a.x byte temp[Fp2_BYTES]; memcpy(temp, bin, Fp2_BYTES); temp[0] &= 0x1F; // clear the header bits - BLST_ERROR ret = Fp2_read_bytes(&a->x, temp, sizeof(temp)); - if (ret != BLST_SUCCESS) { + ERROR ret = Fp2_read_bytes(&a->x, temp, sizeof(temp)); + if (ret != VALID) { return ret; } @@ -773,14 +810,14 @@ BLST_ERROR E2_read_bytes(E2* a, const byte *bin, const int len) { if (G2_SERIALIZATION == UNCOMPRESSED) { ret = Fp2_read_bytes(&(a->y), bin + Fp2_BYTES, sizeof(a->y)); - if (ret != BLST_SUCCESS){ + if (ret != VALID){ return ret; } // check read point is on curve if (!E2_affine_on_curve(a)) { - return BLST_POINT_NOT_ON_CURVE; + return POINT_NOT_ON_CURVE; } - return BLST_SUCCESS; + return VALID; } // compute the possible square root @@ -793,13 +830,13 @@ BLST_ERROR E2_read_bytes(E2* a, const byte *bin, const int len) { Fp2_mul_montg(a_y, a_y, a_x); Fp2_add(a_y, a_y, &B_E2); // B_E2 is already in Montg form if (!Fp2_sqrt_montg(a_y, a_y)) // check whether x^3+b is a quadratic residue - return BLST_POINT_NOT_ON_CURVE; + return POINT_NOT_ON_CURVE; // resulting (x,y) is guaranteed to be on curve (y is already in Montg form) if (Fp2_get_sign(a_y) != y_sign) { Fp2_neg(a_y, a_y); // flip y sign if needed } - return BLST_SUCCESS; + return VALID; } // E2_write_bytes exports a point in E2(Fp^2) to a buffer in a compressed or uncompressed form. @@ -929,52 +966,6 @@ void E2_sum_vector(E2* sum, const E2* y, const int len){ } } -// ------------------- other - - -// Verifies the validity of 2 SPoCK proofs and 2 public keys. -// Membership check in G1 of both proofs is verified in this function. -// Membership check in G2 of both keys is not verified in this function. -// the membership check in G2 is separated to allow optimizing multiple verifications -// using the same public keys. 
-int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* sig2) { - E1 elemsG1[2]; - E2 elemsG2[2]; - - // elemsG1[0] = s1 - if (E1_read_bytes(&elemsG1[0], sig1, G1_SER_BYTES) != BLST_SUCCESS) { - return INVALID; - }; - // check s1 is in G1 - if (!E1_in_G1(&elemsG1[0])) { - return INVALID; - } - - // elemsG1[1] = s2 - if (E1_read_bytes(&elemsG1[1], sig2, G1_SER_BYTES) != BLST_SUCCESS) { - return INVALID; - }; - // check s2 is in G1 - if (!E1_in_G1(&elemsG1[1])) { - return INVALID; - } - - // elemsG2[1] = pk1 - E2_copy(&elemsG2[1], pk1); - - // elemsG2[0] = -pk2 - E2_neg(&elemsG2[0], pk2); - - // double pairing - Fp12 e; - multi_pairing(&e, elemsG1 , elemsG2, 2); - - if (Fp12_is_one(&e)) { - return VALID; - } - return INVALID; -} - // Subtracts all G2 array elements `y` from an element `x` and writes the // result in res void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len){ @@ -983,45 +974,6 @@ void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len){ E2_add(res, x, res); } - -// maps the bytes to a point in G1. -// `len` should be at least Fr_BYTES. -// this is a testing file only, should not be used in any protocol! -void unsafe_map_bytes_to_G1(E1* p, const byte* bytes, int len) { - assert(len >= Fr_BYTES); - // map to Fr - Fr log; - map_bytes_to_Fr(&log, bytes, len); - // multiplies G1 generator by a random scalar - G1_mult_gen(p, &log); -} - -// generates a point in E1\G1 and stores it in p -// this is a testing file only, should not be used in any protocol! -BLST_ERROR unsafe_map_bytes_to_G1complement(E1* p, const byte* bytes, int len) { - assert(G1_SERIALIZATION == COMPRESSED); - assert(len >= G1_SER_BYTES); - - // attempt to deserilize a compressed E1 point from input bytes - // after fixing the header 2 bits - byte copy[G1_SER_BYTES]; - memcpy(copy, bytes, sizeof(copy)); - copy[0] |= 1<<7; // set compression bit - copy[0] &= ~(1<<6); // clear infinity bit - point is not infinity - - BLST_ERROR ser = E1_read_bytes(p, copy, G1_SER_BYTES); - if (ser != BLST_SUCCESS) { - return ser; - } - - // map the point to E2\G2 by clearing G2 order - E1_mult(p, p, (const Fr*)BLS12_381_r); - E1_to_affine(p, p); - - assert(E1_affine_on_curve(p)); // sanity check to make sure p is in E2 - return BLST_SUCCESS; -} - // maps the bytes to a point in G2. // `len` should be at least Fr_BYTES. // this is a testing tool only, it should not be used in any protocol! @@ -1035,11 +987,11 @@ void unsafe_map_bytes_to_G2(E2* p, const byte* bytes, int len) { } // attempts to map `bytes` to a point in E2\G2 and stores it in p. -// `len` should be at least G2_SER_BYTES. It returns BLST_SUCCESS only if mapping +// `len` should be at least G2_SER_BYTES. It returns VALID only if mapping // succeeds. // For now, function only works when E2 serialization is compressed. // this is a testing tool only, it should not be used in any protocol! 
-BLST_ERROR unsafe_map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { +ERROR unsafe_map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { assert(G2_SERIALIZATION == COMPRESSED); assert(len >= G2_SER_BYTES); @@ -1050,8 +1002,8 @@ BLST_ERROR unsafe_map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { copy[0] |= 1<<7; // set compression bit copy[0] &= ~(1<<6); // clear infinity bit - point is not infinity - BLST_ERROR ser = E2_read_bytes(p, copy, G2_SER_BYTES); - if (ser != BLST_SUCCESS) { + ERROR ser = E2_read_bytes(p, copy, G2_SER_BYTES); + if (ser != VALID) { return ser; } @@ -1060,7 +1012,7 @@ BLST_ERROR unsafe_map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { E2_to_affine(p, p); assert(E2_affine_on_curve(p)); // sanity check to make sure p is in E2 - return BLST_SUCCESS; + return VALID; } // ------------------- Pairing utilities @@ -1079,7 +1031,7 @@ void Fp12_set_one(Fp12 *a) { // It assumes `p` and `q` are correctly initialized and all // p[i] and q[i] are respectively on G1 and G2 (it does not // check their memberships). -void multi_pairing(Fp12* res, const E1 *p, const E2 *q, const int len) { +void Fp12_multi_pairing(Fp12* res, const E1 *p, const E2 *q, const int len) { // easier access pointer vec384fp6* res_vec = (vec384fp6*)res; // N_MAX is defined within BLST. It should represent a good tradeoff of the max number diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 75b9385d3ab..812319ced63 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -41,23 +41,23 @@ type pointE1 C.E1 type pointE2 C.E2 type scalar C.Fr -// TODO: For now scalars are represented as field elements Fr since all scalars -// are less than r - check if distinguishing two types in necessary +// Note that scalars and field elements F_r are represented in Go by the same type +// called `scalar`, which is internally represented by C type `Fr`. Scalars used by the +// Go layer are all reduced modulo the curve order `r`. 
const ( // BLS12-381 related lengths imported from the C layer frBytesLen = int(C.Fr_BYTES) + fpBytesLen = int(C.Fp_BYTES) g1BytesLen = int(C.G1_SER_BYTES) g2BytesLen = int(C.G2_SER_BYTES) - fpBytesLen = int(C.Fp_BYTES) - // more internal constants from the C layer - valid = C.VALID - invalid = C.INVALID - blst_valid = int(C.BLST_SUCCESS) - blst_bad_encoding = int(C.BLST_BAD_ENCODING) - blst_bad_scalar = int(C.BLST_BAD_SCALAR) - blst_point_not_on_curve = int(C.BLST_POINT_NOT_ON_CURVE) + // error constants imported from the C layer + valid = C.VALID + invalid = C.INVALID + badEncoding = C.BAD_ENCODING + badValue = C.BAD_VALUE + pointNotOnCurve = C.POINT_NOT_ON_CURVE ) func (a *scalar) String() string { @@ -173,18 +173,17 @@ func readScalarFrStar(a *scalar, src []byte) error { (*C.uchar)(&src[0]), (C.int)(len(src))) - switch int(read) { - case blst_valid: + switch read { + case valid: return nil - case blst_bad_encoding: + case badEncoding: return invalidInputsErrorf("input length must be %d, got %d", frBytesLen, len(src)) - case blst_bad_scalar: + case badValue: return invalidInputsErrorf("scalar is not in the correct range w.r.t the BLS12-381 curve") default: return invalidInputsErrorf("reading the scalar failed") } - } // readPointE2 reads a E2 point from a slice of bytes @@ -196,12 +195,12 @@ func readPointE2(a *pointE2, src []byte) error { (*C.uchar)(&src[0]), (C.int)(len(src))) - switch int(read) { - case blst_valid: + switch read { + case valid: return nil - case blst_bad_encoding, blst_bad_scalar: + case badEncoding, badValue: return invalidInputsErrorf("input could not deserialize to a E2 point") - case blst_point_not_on_curve: + case pointNotOnCurve: return invalidInputsErrorf("input is not a point on curve E2") default: return errors.New("reading E2 point failed") @@ -217,12 +216,12 @@ func readPointE1(a *pointE1, src []byte) error { (*C.uchar)(&src[0]), (C.int)(len(src))) - switch int(read) { - case blst_valid: + switch read { + case valid: return nil - case blst_bad_encoding, blst_bad_scalar: + case badEncoding, badValue: return invalidInputsErrorf("input could not deserialize to a E1 point") - case blst_point_not_on_curve: + case pointNotOnCurve: return invalidInputsErrorf("input is not a point on curve E1") default: return errors.New("reading E1 point failed") @@ -263,7 +262,7 @@ func unsafeMapToG1(pt *pointE1, seed []byte) { // It generates a random point in E2\G2 and stores it in input point. func unsafeMapToG1Complement(pt *pointE1, seed []byte) bool { res := C.unsafe_map_bytes_to_G1complement((*C.E1)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) - return int(res) == blst_valid + return int(res) == valid } // unsafeMapToG2 is a test function, it wraps a call to C since cgo can't be used in go test files. @@ -277,7 +276,7 @@ func unsafeMapToG2(pt *pointE2, seed []byte) { // It generates a random point in E2\G2 and stores it in input point. func unsafeMapToG2Complement(pt *pointE2, seed []byte) bool { res := C.unsafe_map_bytes_to_G2complement((*C.E2)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) - return int(res) == blst_valid + return int(res) == valid } // This is only a TEST function. 
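Aside (a standalone sketch, not the library's code): this change and the previous one both rely on cgo exposing simple C #define constants and enum values directly to Go, where they can be used in const blocks and switch statements. A minimal self-contained example with hypothetical names:

package main

/*
#define FR_BYTES 32
typedef enum { VALID = 0, INVALID, BAD_ENCODING } ERROR;
*/
import "C"

import "fmt"

const frBytes = int(C.FR_BYTES) // simple numeric macro, usable in a Go const block

func main() {
    var e C.ERROR = C.BAD_ENCODING
    switch e {
    case C.VALID:
        fmt.Println("valid")
    case C.BAD_ENCODING:
        fmt.Println("bad encoding, expected", frBytes, "bytes")
    default:
        fmt.Println("other error")
    }
}
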
diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 48a7b1476de..921df90624d 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -11,10 +11,18 @@ typedef uint8_t byte; typedef _Bool bool; // assuming cgo is using a modern enough compiler +// minimum targeted security level #define SEC_BITS 128 -#define VALID 0 -#define INVALID 1 -#define UNDEFINED (((VALID&1)^1) | ((INVALID&2)^2)) // different value than RLC_OK and RLC_ERR + +typedef enum { + VALID = 0, + INVALID, + BAD_ENCODING, + BAD_VALUE, + POINT_NOT_ON_CURVE, + POINT_NOT_IN_GROUP, + UNDEFINED, +} ERROR; #define BITS_TO_BYTES(x) ((x+7)>>3) #define BITS_TO_LIMBS(x) ((x+63)>>6) @@ -42,13 +50,10 @@ typedef _Bool bool; // assuming cgo is using a modern enough compiler #define G1_SER_BYTES (G1_BYTES/(G1_SERIALIZATION+1)) #define G2_SER_BYTES (G2_BYTES/(G2_SERIALIZATION+1)) -// BLS based SPoCK -int bls_spock_verify(const E2*, const byte*, const E2*, const byte*); - // Fr utilities extern const Fr BLS12_381_rR; -bool Fr_is_zero(const Fr* a); -bool Fr_is_equal(const Fr* a, const Fr* b); +bool Fr_is_zero(const Fr* a); +bool Fr_is_equal(const Fr* a, const Fr* b); void Fr_set_limb(Fr*, const limb_t); void Fr_copy(Fr*, const Fr*); void Fr_set_zero(Fr*); @@ -63,10 +68,10 @@ void Fr_from_montg(Fr *res, const Fr *a); void Fr_exp_montg(Fr *res, const Fr* base, const limb_t* expo, const int expo_len); void Fr_inv_montg_eucl(Fr *res, const Fr *a); void Fr_inv_exp_montg(Fr *res, const Fr *a); -BLST_ERROR Fr_read_bytes(Fr* a, const byte *bin, int len); -BLST_ERROR Fr_star_read_bytes(Fr* a, const byte *bin, int len); +ERROR Fr_read_bytes(Fr* a, const byte *bin, int len); +ERROR Fr_star_read_bytes(Fr* a, const byte *bin, int len); void Fr_write_bytes(byte *bin, const Fr* a); -bool map_bytes_to_Fr(Fr*, const byte*, int); +bool map_bytes_to_Fr(Fr*, const byte*, int); // Fp utilities void Fp_mul_montg(Fp *, const Fp *, const Fp *); @@ -74,34 +79,34 @@ void Fp_squ_montg(Fp *, const Fp *); // E1 and G1 utilities void E1_copy(E1*, const E1*); -bool E1_is_equal(const E1*, const E1*); +bool E1_is_equal(const E1*, const E1*); void E1_set_infty(E1*); -bool E1_is_infty(const E1*); +bool E1_is_infty(const E1*); void E1_to_affine(E1*, const E1*); -bool E1_affine_on_curve(const E1*); -bool E1_in_G1(const E1*); +bool E1_affine_on_curve(const E1*); +bool E1_in_G1(const E1*); void E1_mult(E1*, const E1*, const Fr*); void E1_add(E1*, const E1*, const E1*); void E1_neg(E1*, const E1*); void E1_sum_vector(E1*, const E1*, const int); int E1_sum_vector_byte(byte*, const byte*, const int); void G1_mult_gen(E1*, const Fr*); -BLST_ERROR E1_read_bytes(E1*, const byte *, const int); +ERROR E1_read_bytes(E1*, const byte *, const int); void E1_write_bytes(byte *, const E1*); void unsafe_map_bytes_to_G1(E1*, const byte*, int); -BLST_ERROR unsafe_map_bytes_to_G1complement(E1*, const byte*, int); -// hash to curve functions (functions in bls12381_hashtocurve.c) -#define MAP_TO_G1_INPUT_LEN (2*(Fp_BYTES + SEC_BITS/8)) -int map_to_G1(E1*, const byte*, const int); +ERROR unsafe_map_bytes_to_G1complement(E1*, const byte*, int); + +#define MAP_TO_G1_INPUT_LEN (2*(Fp_BYTES + SEC_BITS/8)) +int map_to_G1(E1*, const byte*, const int); // functions in bls12381_hashtocurve.c // E2 and G2 utilities void E2_set_infty(E2* p); -bool E2_is_infty(const E2*); -bool E2_affine_on_curve(const E2*); -bool E2_is_equal(const E2*, const E2*); +bool E2_is_infty(const E2*); +bool E2_affine_on_curve(const E2*); +bool E2_is_equal(const E2*, const E2*); void E2_copy(E2*, const E2*); 
void E2_to_affine(E2*, const E2*); -BLST_ERROR E2_read_bytes(E2*, const byte *, const int); +ERROR E2_read_bytes(E2*, const byte *, const int); void E2_write_bytes(byte *, const E2*); void G2_mult_gen(E2*, const Fr*); void E2_mult(E2*, const E2*, const Fr*); @@ -110,14 +115,14 @@ void E2_add(E2* res, const E2* a, const E2* b); void E2_neg(E2*, const E2*); void E2_sum_vector(E2*, const E2*, const int); void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); -bool E2_in_G2(const E2*); +bool E2_in_G2(const E2*); void unsafe_map_bytes_to_G2(E2*, const byte*, int); -BLST_ERROR unsafe_map_bytes_to_G2complement(E2*, const byte*, int); +ERROR unsafe_map_bytes_to_G2complement(E2*, const byte*, int); // pairing and Fp12 -bool Fp12_is_one(Fp12*); +bool Fp12_is_one(Fp12*); void Fp12_set_one(Fp12*); -void multi_pairing(Fp12*, const E1*, const E2*, const int); +void Fp12_multi_pairing(Fp12*, const E1*, const E2*, const int); // utility testing function void xmd_sha256(byte *, int, byte *, int, byte *, int); diff --git a/crypto/bls_core.c b/crypto/bls_core.c index e1578a150fe..0771269ed86 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -47,7 +47,7 @@ static int bls_verify_E1(const E2* pk, const E1* s, const E1* h) { // double pairing Fp12 e; - multi_pairing(&e, elemsG1, elemsG2, 2); + Fp12_multi_pairing(&e, elemsG1, elemsG2, 2); if (Fp12_is_one(&e)) { return VALID; } @@ -80,7 +80,7 @@ int bls_verifyPerDistinctMessage(const byte* sig, if (!elemsG2) goto outG2; // elemsG1[0] = sig - if (E1_read_bytes(&elemsG1[0], sig, G1_SER_BYTES) != BLST_SUCCESS) { + if (E1_read_bytes(&elemsG1[0], sig, G1_SER_BYTES) != VALID) { ret = INVALID; goto out; } @@ -113,7 +113,7 @@ int bls_verifyPerDistinctMessage(const byte* sig, // multi pairing Fp12 e; - multi_pairing(&e, elemsG1 , elemsG2, nb_hashes+1); + Fp12_multi_pairing(&e, elemsG1 , elemsG2, nb_hashes+1); if (Fp12_is_one(&e)) { ret = VALID; } else { @@ -154,7 +154,7 @@ int bls_verifyPerDistinctKey(const byte* sig, if (!elemsG2) goto outG2; // elemsG1[0] = s - if (E1_read_bytes(&elemsG1[0], sig, G1_SER_BYTES) != BLST_SUCCESS) { + if (E1_read_bytes(&elemsG1[0], sig, G1_SER_BYTES) != VALID) { ret = INVALID; goto out; } @@ -206,7 +206,7 @@ int bls_verifyPerDistinctKey(const byte* sig, // multi pairing Fp12 e; - multi_pairing(&e, elemsG1, elemsG2, nb_pks+1); + Fp12_multi_pairing(&e, elemsG1, elemsG2, nb_pks+1); if (Fp12_is_one(&e)) { ret = VALID; @@ -230,7 +230,7 @@ int bls_verifyPerDistinctKey(const byte* sig, int bls_verify(const E2* pk, const byte* sig, const byte* hash, const int hash_len) { E1 s, h; // deserialize the signature into a curve point - if (E1_read_bytes(&s, sig, G1_SER_BYTES) != BLST_SUCCESS) { + if (E1_read_bytes(&s, sig, G1_SER_BYTES) != VALID) { return INVALID; } @@ -381,7 +381,7 @@ void bls_batch_verify(const int sigs_len, byte* results, const E2* pks_input, // - valid points are multiplied by a random scalar (same for public keys at same index) // to make sure a signature at index (i) is verified against the public key at the same index. 
int read_ret = E1_read_bytes(&sigs[i], &sigs_bytes[G1_SER_BYTES*i], G1_SER_BYTES); - if (read_ret != BLST_SUCCESS || !E1_in_G1(&sigs[i])) { + if (read_ret != VALID || !E1_in_G1(&sigs[i])) { // set signature and key to infinity (no effect on the aggregation tree) // and set result to invalid (result won't be overwritten) E2_set_infty(&pks[i]); @@ -420,3 +420,47 @@ void bls_batch_verify(const int sigs_len, byte* results, const E2* pks_input, out_sigs: free(pks); } + +// Verifies the validity of 2 SPoCK proofs and 2 public keys. +// Membership check in G1 of both proofs is verified in this function. +// Membership check in G2 of both keys is not verified in this function. +// the membership check in G2 is separated to allow optimizing multiple verifications +// using the same public keys. +int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* sig2) { + E1 elemsG1[2]; + E2 elemsG2[2]; + + // elemsG1[0] = s1 + if (E1_read_bytes(&elemsG1[0], sig1, G1_SER_BYTES) != VALID) { + return INVALID; + }; + // check s1 is in G1 + if (!E1_in_G1(&elemsG1[0])) { + return INVALID; + } + + // elemsG1[1] = s2 + if (E1_read_bytes(&elemsG1[1], sig2, G1_SER_BYTES) != VALID) { + return INVALID; + }; + // check s2 is in G1 + if (!E1_in_G1(&elemsG1[1])) { + return INVALID; + } + + // elemsG2[1] = pk1 + E2_copy(&elemsG2[1], pk1); + + // elemsG2[0] = -pk2 + E2_neg(&elemsG2[0], pk2); + + // double pairing + Fp12 e; + Fp12_multi_pairing(&e, elemsG1 , elemsG2, 2); + + if (Fp12_is_one(&e)) { + return VALID; + } + return INVALID; +} + diff --git a/crypto/bls_include.h b/crypto/bls_include.h index 1ca61b376c4..2cbf91b2936 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -5,8 +5,7 @@ #include "bls12381_utils.h" - -// bls core (functions in bls_core.c) +// BLS signature core (functions in bls_core.c) int bls_sign(byte*, const Fr*, const byte*, const int); int bls_verify(const E2*, const byte*, const byte*, const int); int bls_verifyPerDistinctMessage(const byte*, const int, const byte*, const uint32_t*, @@ -17,4 +16,7 @@ int bls_verifyPerDistinctKey(const byte*, void bls_batch_verify(const int, byte*, const E2*, const byte*, const byte*, const int, const byte*); +// BLS based SPoCK +int bls_spock_verify(const E2*, const byte*, const E2*, const byte*); + #endif diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index 6c99ae461e2..fdb21a986f1 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -5,12 +5,7 @@ import ( "errors" "fmt" - _ "errors" - - _ "fmt" - "github.com/onflow/flow-go/crypto/hash" - _ "github.com/onflow/flow-go/crypto/hash" ) // BLS multi-signature using BLS12-381 curve @@ -95,7 +90,6 @@ func BLSVerifyPOP(pk PublicKey, s Signature) (bool, error) { // - (nil, error) if an unexpected error occurs // - (aggregated_signature, nil) otherwise func AggregateBLSSignatures(sigs []Signature) (Signature, error) { - // check for empty list if len(sigs) == 0 { return nil, blsAggregateEmptyListError @@ -140,7 +134,6 @@ func AggregateBLSSignatures(sigs []Signature) (Signature, error) { // - (nil, blsAggregateEmptyListError) if no keys are provided (input slice is empty) // - (aggregated_key, nil) otherwise func AggregateBLSPrivateKeys(keys []PrivateKey) (PrivateKey, error) { - // check for empty list if len(keys) == 0 { return nil, blsAggregateEmptyListError diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index 78a87823b4c..e951cc9c33f 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -22,7 
+22,7 @@ static void Fr_lagrange_coeff_at_zero(Fr* res, const int i, const byte indices[] // the highest k such that fact(MAX_IND)/fact(MAX_IND-k) < 2^64 (approximately 64/MAX_IND_BITS) // this means we can multiply up to (k) indices in a limb (64 bits) without overflowing. - #define MAX_IND_LOOPS 64/MAX_IND_BITS + #define MAX_IND_LOOPS (64/MAX_IND_BITS) const int loops = MAX_IND_LOOPS; int k,j = 0; Fr tmp; @@ -88,7 +88,7 @@ int E1_lagrange_interpolate_at_zero_write(byte* dest, const byte* shares, const E1* E1_shares = malloc(sizeof(E1) * len); for (int i=0; i < len; i++) { read_ret = E1_read_bytes(&E1_shares[i], &shares[G1_SER_BYTES * i], G1_SER_BYTES); - if (read_ret != BLST_SUCCESS) { + if (read_ret != VALID) { goto out; } } diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 20c2fcad5df..e3c0bb9701a 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -5,11 +5,6 @@ #include "point.h" #include "fields.h" #include "consts.h" -#include "errors.h" - -// TODO: add sanity checks that BLST_PK_IS_INFINITY is indeed the last -// enum value (eventually submit a fix to BLST) -#define BLST_BAD_SCALAR ((BLST_PK_IS_INFINITY)+1) // types used by the Flow crypto library that are imported from BLST // these type definitions are used as an abstraction from BLST internal types diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 9966fbcfc37..15e8e0c48b3 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -63,16 +63,16 @@ void G2_vector_write_bytes(byte* out, const E2* A, const int len) { // The function imports an array of E2 points from a concatenated array of bytes. // The bytes array is supposed to be in (len * G2_SER_BYTES) -BLST_ERROR E2_vector_read_bytes(E2* A, const byte* src, const int len){ +ERROR E2_vector_read_bytes(E2* A, const byte* src, const int len){ byte* p = (byte*) src; for (int i=0; i Date: Tue, 30 May 2023 17:22:21 -0600 Subject: [PATCH 124/200] update code base to work for G1 serialization defined as uncompressed --- crypto/bls.go | 13 +++------- crypto/bls12381_utils.c | 40 +++++++++++------------------- crypto/bls12381_utils.go | 35 +++++++++++++++++++++++--- crypto/bls12381_utils.h | 15 +++++------ crypto/bls12381_utils_test.go | 27 ++++++++------------ crypto/bls_include.h | 4 +-- crypto/bls_multisig.go | 2 +- crypto/bls_test.go | 28 ++++++++++++++------- crypto/bls_thresholdsign_include.h | 4 +-- crypto/dkg_include.h | 4 +-- crypto/sign.go | 7 +++--- crypto/spock_test.go | 4 +-- 12 files changed, 99 insertions(+), 84 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 93dd487a817..7f884a73c49 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -207,11 +207,6 @@ func (pk *pubKeyBLSBLS12381) Verify(s Signature, data []byte, kmac hash.Hasher) } } -// 0xC0 is the header of the point at infinity serialization (either in G1 or G2) -const infinityPointHeader = byte(0xC0) - -var identityBLSSignature = append([]byte{infinityPointHeader}, make([]byte, SignatureLenBLSBLS12381-1)...) - // IsBLSSignatureIdentity checks whether the input signature is // the identity signature (point at infinity in G1). // @@ -221,7 +216,7 @@ var identityBLSSignature = append([]byte{infinityPointHeader}, make([]byte, Sign // suspected to be equal to identity, which avoids failing the aggregated // signature verification. func IsBLSSignatureIdentity(s Signature) bool { - return bytes.Equal(s, identityBLSSignature) + return bytes.Equal(s, g1Serialization) } // generatePrivateKey deterministically generates a private key for BLS on BLS12-381 curve. 
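Note on the layout behind g1Serialization: this patch derives the identity encoding from the serialization mode instead of hard-coding the 0xC0 header. A minimal sketch of that layout, assuming only the zcash-style header bits already used in this series (bit 7 = compression, bit 6 = infinity); the helper name identitySerialization is illustrative and not part of the patch:

    // identitySerialization sketches how the point-at-infinity encoding is built:
    // one header byte (0xC0 when the group uses compressed serialization, 0x40
    // when uncompressed), followed by zero bytes up to the serialized point length.
    // serLen stands for g1BytesLen (G1) or g2BytesLen (G2).
    func identitySerialization(compressed bool, serLen int) []byte {
        header := byte(1 << 6) // infinity bit set (0x40)
        if compressed {
            header |= 1 << 7 // compression bit set as well (0xC0)
        }
        out := make([]byte, serLen)
        out[0] = header
        return out
    }

With the package defaults (G1 compressed), this reproduces the previous 0xC0-headed identity signature, so IsBLSSignatureIdentity keeps accepting the same byte string.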
@@ -347,8 +342,7 @@ func (a *blsBLS12381Algo) decodePublicKey(publicKeyBytes []byte) (PublicKey, err // decodePublicKeyCompressed decodes a slice of bytes into a public key. // since we use the compressed representation by default, this checks the default and delegates to decodePublicKeyCompressed func (a *blsBLS12381Algo) decodePublicKeyCompressed(publicKeyBytes []byte) (PublicKey, error) { - // in compression mode, g2BytesLen is equal to 2 * Fp_bytes - if g2BytesLen != 2*fpBytesLen { + if !isG2Compressed() { panic("library is not configured to use compressed public key serialization") } return a.decodePublicKey(publicKeyBytes) @@ -478,8 +472,7 @@ func (pk *pubKeyBLSBLS12381) Size() int { // The encoding is a compressed encoding of the point // [zcash] https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format- func (a *pubKeyBLSBLS12381) EncodeCompressed() []byte { - // in compression mode, g2BytesLen is equal to 2 * Fp_bytes - if g2BytesLen != 2*fpBytesLen { + if !isG2Compressed() { panic("library is not configured to use compressed public key serialization") } dest := make([]byte, g2BytesLen) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 35bf1ff4686..30f8b862aa0 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -543,6 +543,7 @@ ERROR E1_read_bytes(E1* a, const byte *bin, const int len) { if (ret != VALID) { return ret; } + Fp_to_montg(&a->x, &a->x); // set a.z to 1 Fp_copy(&a->z, &BLS12_381_pR); @@ -552,6 +553,7 @@ ERROR E1_read_bytes(E1* a, const byte *bin, const int len) { if (ret != VALID){ return ret; } + Fp_to_montg(&a->y, &a->y); // check read point is on curve if (!E1_affine_on_curve(a)) { return POINT_NOT_ON_CURVE; @@ -560,12 +562,12 @@ ERROR E1_read_bytes(E1* a, const byte *bin, const int len) { } // compute the possible square root - Fp_to_montg(&a->x, &a->x); Fp_squ_montg(&a->y, &a->x); Fp_mul_montg(&a->y, &a->y, &a->x); // x^3 Fp_add(&a->y, &a->y, &B_E1); // B_E1 is already in Montg form - if (!Fp_sqrt_montg(&a->y, &a->y)) // check whether x^3+b is a quadratic residue + if (!Fp_sqrt_montg(&a->y, &a->y)) { // check whether x^3+b is a quadratic residue return POINT_NOT_ON_CURVE; + } // resulting (x,y) is guaranteed to be on curve (y is already in Montg form) if (Fp_get_sign(&a->y) != y_sign) { @@ -718,30 +720,18 @@ void unsafe_map_bytes_to_G1(E1* p, const byte* bytes, int len) { G1_mult_gen(p, &log); } -// generates a point in E1\G1 and stores it in p +// maps bytes to a point in E1\G1. +// `len` must be at least 96 bytes. // this is a testing file only, should not be used in any protocol! 
-ERROR unsafe_map_bytes_to_G1complement(E1* p, const byte* bytes, int len) { - assert(G1_SERIALIZATION == COMPRESSED); - assert(len >= G1_SER_BYTES); - - // attempt to deserilize a compressed E1 point from input bytes - // after fixing the header 2 bits - byte copy[G1_SER_BYTES]; - memcpy(copy, bytes, sizeof(copy)); - copy[0] |= 1<<7; // set compression bit - copy[0] &= ~(1<<6); // clear infinity bit - point is not infinity - - ERROR ser = E1_read_bytes(p, copy, G1_SER_BYTES); - if (ser != VALID) { - return ser; - } - - // map the point to E2\G2 by clearing G2 order - E1_mult(p, p, (const Fr*)BLS12_381_r); - E1_to_affine(p, p); - - assert(E1_affine_on_curve(p)); // sanity check to make sure p is in E2 - return VALID; +void unsafe_map_bytes_to_G1complement(E1* p, const byte* bytes, int len) { + assert(len >= 96); + Fp u; + map_96_bytes_to_Fp(&u, bytes, 96); + // map to E1's isogenous and then to E1 + map_to_isogenous_E1((POINTonE1 *)p, u); + isogeny_map_to_E1((POINTonE1 *)p, (POINTonE1 *)p); + // clear G1 order + E1_mult(p, p, (Fr*)&BLS12_381_r); } // ------------------- E2 utilities diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 812319ced63..b9535d39955 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -60,6 +60,28 @@ const ( pointNotOnCurve = C.POINT_NOT_ON_CURVE ) +// header of the point at infinity serializations +var g1SerHeader byte // g1 +var g2SerHeader byte // g2 + +// `g1“ serialization +var g1Serialization []byte + +// initialization of BLS12-381 curve +func initBLS12381() { + if isG1Compressed() { + g1SerHeader = 0xC0 + } else { + g1SerHeader = 0x40 + } + g1Serialization = append([]byte{g1SerHeader}, make([]byte, g1BytesLen-1)...) + if isG2Compressed() { + g2SerHeader = 0xC0 + } else { + g2SerHeader = 0x40 + } +} + func (a *scalar) String() string { encoding := make([]byte, frBytesLen) writeScalar(encoding, a) @@ -260,9 +282,8 @@ func unsafeMapToG1(pt *pointE1, seed []byte) { // unsafeMapToG1Complement is a test function, it wraps a call to C since cgo can't be used in go test files. // It generates a random point in E2\G2 and stores it in input point. -func unsafeMapToG1Complement(pt *pointE1, seed []byte) bool { - res := C.unsafe_map_bytes_to_G1complement((*C.E1)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) - return int(res) == valid +func unsafeMapToG1Complement(pt *pointE1, seed []byte) { + C.unsafe_map_bytes_to_G1complement((*C.E1)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) } // unsafeMapToG2 is a test function, it wraps a call to C since cgo can't be used in go test files. 
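The simplified unsafeMapToG1Complement wrapper above no longer reports an error; the subgroup test later in this patch exercises it directly. A condensed usage sketch (only a sketch: it relies on the package-internal pointE1, unsafeMapToG1, unsafeMapToG1Complement and checkMembershipG1, and the caller supplies the random seed):

    // subgroupCheckSketch mirrors the updated TestSubgroupCheck: the same seed
    // is mapped once into G1 and once into its complement E1 \ G1, and only the
    // first point passes the G1 membership check.
    // seed must hold at least 96 random bytes (required by the C-side mapping).
    func subgroupCheckSketch(seed []byte) (inG1, inComplement bool) {
        var p pointE1
        unsafeMapToG1(&p, seed)               // point in G1
        inG1 = checkMembershipG1(&p)          // expected: true
        unsafeMapToG1Complement(&p, seed)     // point in E1 \ G1
        inComplement = !checkMembershipG1(&p) // expected: true
        return inG1, inComplement
    }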
@@ -307,3 +328,11 @@ func hashToG1Bytes(data, dst []byte) []byte { writePointE1(pointBytes, &point) return pointBytes } + +func isG1Compressed() bool { + return g1BytesLen == fpBytesLen +} + +func isG2Compressed() bool { + return g2BytesLen == 2*fpBytesLen +} diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 921df90624d..ccbb4c9655c 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -2,8 +2,8 @@ // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -#ifndef _REL_MISC_INCLUDE_H -#define _REL_MISC_INCLUDE_H +#ifndef _BLS12_381_UTILS_H +#define _BLS12_381_UTILS_H #include #include "blst_include.h" @@ -43,8 +43,8 @@ typedef enum { #define G2_BYTES (2*Fp2_BYTES) // Compressed and uncompressed points -#define COMPRESSED 1 -#define UNCOMPRESSED 0 +#define COMPRESSED 1 +#define UNCOMPRESSED 0 #define G1_SERIALIZATION (COMPRESSED) #define G2_SERIALIZATION (COMPRESSED) #define G1_SER_BYTES (G1_BYTES/(G1_SERIALIZATION+1)) @@ -94,7 +94,7 @@ void G1_mult_gen(E1*, const Fr*); ERROR E1_read_bytes(E1*, const byte *, const int); void E1_write_bytes(byte *, const E1*); void unsafe_map_bytes_to_G1(E1*, const byte*, int); -ERROR unsafe_map_bytes_to_G1complement(E1*, const byte*, int); +void unsafe_map_bytes_to_G1complement(E1*, const byte*, int); #define MAP_TO_G1_INPUT_LEN (2*(Fp_BYTES + SEC_BITS/8)) int map_to_G1(E1*, const byte*, const int); // functions in bls12381_hashtocurve.c @@ -130,6 +130,7 @@ void xmd_sha256(byte *, int, byte *, int, byte *, int); // Debugging related functions #define DEBUG 0 #if (DEBUG == 1) +#include void bytes_print_(char*, byte*, int); void Fr_print_(char*, Fr*); void Fp_print_(char*, const Fp*); @@ -137,6 +138,6 @@ void Fp2_print_(char*, const Fp2*); void Fp12_print_(char*, const Fp12*); void E1_print_(char*, const E1*, const int); void E2_print_(char*, const E2*, const int); -#endif // DEBUG +#endif /* DEBUG */ -#endif \ No newline at end of file +#endif /* BLS12_381_UTILS */ \ No newline at end of file diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 2c9d76bbbe5..7741238278e 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -139,12 +139,7 @@ func TestSubgroupCheck(t *testing.T) { unsafeMapToG1(&p, seed) // point in G1 assert.True(t, checkMembershipG1(&p)) - inG1 := false - for !inG1 { - _, err := prg.Read(seed) - require.NoError(t, err) - inG1 = unsafeMapToG1Complement(&p, seed) // point in E2\G2 - } + unsafeMapToG1Complement(&p, seed) // point in E2\G2 assert.False(t, checkMembershipG1(&p)) }) @@ -198,8 +193,8 @@ func TestReadWriteG1(t *testing.T) { bytes := make([]byte, g1BytesLen) // generate a random G1 point, encode it, decode it, // and compare it the original point - iterations := 50 t.Run("random points", func(t *testing.T) { + iterations := 50 for i := 0; i < iterations; i++ { var p, q pointE1 _, err := prg.Read(seed) @@ -213,16 +208,14 @@ func TestReadWriteG1(t *testing.T) { }) t.Run("infinity", func(t *testing.T) { - for i := 0; i < iterations; i++ { - var p, q pointE1 - seed := make([]byte, frBytesLen) - unsafeMapToG1(&p, seed) // this results in the infinity point - writePointE1(bytes, &p) - require.True(t, IsBLSSignatureIdentity(bytes)) // sanity check - err := readPointE1(&q, bytes) - require.NoError(t, err) - assert.True(t, p.equals(&q)) - } + var p, q pointE1 + seed := make([]byte, frBytesLen) + unsafeMapToG1(&p, seed) // this results in the infinity point + 
writePointE1(bytes, &p) + require.True(t, IsBLSSignatureIdentity(bytes)) // sanity check + err := readPointE1(&q, bytes) + require.NoError(t, err) + assert.True(t, p.equals(&q)) }) } diff --git a/crypto/bls_include.h b/crypto/bls_include.h index 2cbf91b2936..c5dba4d45de 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -1,7 +1,7 @@ // this file is about the core functions required by the BLS signature scheme -#ifndef _REL_BLS_INCLUDE_H -#define _REL_BLS_INCLUDE_H +#ifndef _BLS_INCLUDE_H +#define _BLS_INCLUDE_H #include "bls12381_utils.h" diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index fdb21a986f1..7adbb0c1f45 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -507,7 +507,7 @@ func BatchVerifyBLSSignaturesOneMessage( // However, the boolean return for index `i` is set to `false` and won't be overwritten. returnBool[i] = false pkPoints = append(pkPoints, getIdentityPoint()) - flatSigs = append(flatSigs, identityBLSSignature...) + flatSigs = append(flatSigs, g1Serialization...) } else { pkPoints = append(pkPoints, pkBLS.point) flatSigs = append(flatSigs, sigs[i]...) diff --git a/crypto/bls_test.go b/crypto/bls_test.go index d8561ccc5f6..7ea369a5b73 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -29,6 +29,9 @@ func TestBLSMainMethods(t *testing.T) { // - signature decoding only accepts reduced x-coordinates to avoid signature malleability t.Run("invalid x coordinate larger than p", func(t *testing.T) { + if !isG1Compressed() { + t.Skip() + } msg, err := hex.DecodeString("7f26ba692dc2da7ff828ef4675ff1cd6ab855fca0637b6dab295f1df8e51bc8bb1b8f0c6610aabd486cf1f098f2ddbc6691d94e10f928816f890a3d366ce46249836a595c7ea1828af52e899ba2ab627ab667113bb563918c5d5a787c414399487b4e3a7") require.NoError(t, err) validSig, err := hex.DecodeString("80b0cac2a0f4f8881913edf2b29065675dfed6f6f4e17e9b5d860a845d4e7d476b277d06a493b81482e63d8131f9f2fa") @@ -190,7 +193,7 @@ func TestBLSEncodeDecode(t *testing.T) { t.Run("infinity public key", func(t *testing.T) { // decode an identity public key pkBytes := make([]byte, PubKeyLenBLSBLS12381) - pkBytes[0] = infinityPointHeader + pkBytes[0] = g2SerHeader pk, err := DecodePublicKey(BLSBLS12381, pkBytes) require.NoError(t, err, "decoding identity public key should succeed") assert.True(t, pk.Equals(IdentityBLSPublicKey())) @@ -543,12 +546,15 @@ func TestBLSAggregatePublicKeys(t *testing.T) { assert.True(t, blsKey.isIdentity) // check of encoding header pkBytes := aggPK.Encode() - assert.Equal(t, infinityPointHeader, pkBytes[0]) + assert.Equal(t, g2SerHeader, pkBytes[0]) }) t.Run("Identity public key from opposite points", func(t *testing.T) { + if !isG2Compressed() { + t.Skip() + } pkBytes := pks[0].Encode() - negatePoint(pkBytes) + negateCompressedPoint(pkBytes) minusPk, err := DecodePublicKey(BLSBLS12381, pkBytes) require.NoError(t, err) // aggregated public keys @@ -561,7 +567,7 @@ func TestBLSAggregatePublicKeys(t *testing.T) { assert.True(t, blsKey.isIdentity) // check of encoding header pkBytes = aggPK.Encode() - assert.Equal(t, infinityPointHeader, pkBytes[0]) + assert.Equal(t, g2SerHeader, pkBytes[0]) }) } @@ -822,9 +828,9 @@ func TestBLSBatchVerify(t *testing.T) { } // Utility function that flips a point sign bit to negate the point -// this is shortcut which works only for zcash BLS12-381 compressed serialization -// Applicable to both signatures and public keys -func negatePoint(pointbytes []byte) { +// this is shortcut which works only for zcash BLS12-381 compressed serialization. 
+// Applicable to both signatures and public keys. +func negateCompressedPoint(pointbytes []byte) { pointbytes[0] ^= 0x20 } @@ -1190,10 +1196,14 @@ func TestBLSIdentity(t *testing.T) { hasher := NewExpandMsgXOFKMAC128("") t.Run("identity signature comparison", func(t *testing.T) { + if !isG1Compressed() { + t.Skip() + } // verify that constructed identity signatures are recognized as such by IsBLSSignatureIdentity. // construct identity signature by summing (aggregating) a random signature and its inverse. - assert.True(t, IsBLSSignatureIdentity(identityBLSSignature)) + // sanity check to start + assert.True(t, IsBLSSignatureIdentity(g1Serialization)) // sum up a random signature and its inverse to get identity sk := randomSK(t, rand) @@ -1201,7 +1211,7 @@ func TestBLSIdentity(t *testing.T) { require.NoError(t, err) oppositeSig := make([]byte, SignatureLenBLSBLS12381) copy(oppositeSig, sig) - negatePoint(oppositeSig) + negateCompressedPoint(oppositeSig) aggSig, err := AggregateBLSSignatures([]Signature{sig, oppositeSig}) require.NoError(t, err) assert.True(t, IsBLSSignatureIdentity(aggSig)) diff --git a/crypto/bls_thresholdsign_include.h b/crypto/bls_thresholdsign_include.h index 3937f8ce965..7c27c3b97b8 100644 --- a/crypto/bls_thresholdsign_include.h +++ b/crypto/bls_thresholdsign_include.h @@ -1,5 +1,5 @@ -#ifndef _REL_THRESHOLD_INCLUDE_H -#define _REL_THRESHOLD_INCLUDE_H +#ifndef _THRESHOLD_INCLUDE_H +#define _THRESHOLD_INCLUDE_H #include "bls_include.h" diff --git a/crypto/dkg_include.h b/crypto/dkg_include.h index c361d3ce861..7cd2b8b7d2d 100644 --- a/crypto/dkg_include.h +++ b/crypto/dkg_include.h @@ -1,5 +1,5 @@ -#ifndef _REL_DKG_INCLUDE_H -#define _REL_DKG_INCLUDE_H +#ifndef _DKG_INCLUDE_H +#define _DKG_INCLUDE_H #include "bls12381_utils.h" diff --git a/crypto/sign.go b/crypto/sign.go index ff4348f3b09..d400898d97d 100644 --- a/crypto/sign.go +++ b/crypto/sign.go @@ -65,19 +65,18 @@ func newSigner(algo SigningAlgorithm) (signer, error) { // Initialize the context of all algos func init() { - // P-256 + // ECDSA p256Instance = &(ecdsaAlgo{ curve: elliptic.P256(), algo: ECDSAP256, }) - - // secp256k1 secp256k1Instance = &(ecdsaAlgo{ curve: btcec.S256(), algo: ECDSASecp256k1, }) - // bls12-381 + // BLS + initBLS12381() blsInstance = &blsBLS12381Algo{ algo: BLSBLS12381, } diff --git a/crypto/spock_test.go b/crypto/spock_test.go index 75de3dea838..59498a42f6f 100644 --- a/crypto/spock_test.go +++ b/crypto/spock_test.go @@ -69,7 +69,7 @@ func TestSPOCKProveVerifyAgainstData(t *testing.T) { t.Run("identity proof", func(t *testing.T) { // verifying with a pair of (proof, publicKey) equal to (identity_signature, identity_key) should // return false - identityProof := identityBLSSignature + identityProof := g1Serialization result, err := SPOCKVerifyAgainstData(IdentityBLSPublicKey(), identityProof, data, kmac) assert.NoError(t, err) assert.False(t, result) @@ -166,7 +166,7 @@ func TestSPOCKProveVerify(t *testing.T) { t.Run("identity proof", func(t *testing.T) { // verifying with either pair of (proof, publicKey) equal to (identity_signature, identity_key) should // return falsen with any other (proof, key) pair. 
- identityProof := identityBLSSignature + identityProof := g1Serialization result, err := SPOCKVerify(IdentityBLSPublicKey(), identityProof, sk2.PublicKey(), pr2) assert.NoError(t, err) assert.False(t, result) From 9ed47f1975689db2a296c6c22da0e446d7a0158c Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 30 May 2023 18:23:51 -0600 Subject: [PATCH 125/200] update code base to work for G2 serialization defined as uncompressed --- crypto/bls.go | 16 +++++----- crypto/bls12381_utils.c | 55 +++++++++++++---------------------- crypto/bls12381_utils.go | 5 ++-- crypto/bls12381_utils.h | 2 +- crypto/bls12381_utils_test.go | 18 +++++++----- crypto/bls_test.go | 5 +++- crypto/sign_test_utils.go | 26 +++++++++-------- 7 files changed, 62 insertions(+), 65 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 7f884a73c49..447a203033b 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -286,7 +286,7 @@ func (a *blsBLS12381Algo) generatePrivateKey(ikm []byte) (PrivateKey, error) { const invalidBLSSignatureHeader = byte(0xE0) // BLSInvalidSignature returns an invalid signature that fails when verified -// with any message and public key. +// with any message and public key, which can be used for testing. // // The signature bytes represent an invalid serialization of a point which // makes the verification fail early. The verification would return (false, nil). @@ -475,15 +475,17 @@ func (a *pubKeyBLSBLS12381) EncodeCompressed() []byte { if !isG2Compressed() { panic("library is not configured to use compressed public key serialization") } - dest := make([]byte, g2BytesLen) - writePointE2(dest, &a.point) - return dest + return a.Encode() } -// Encode returns a byte encoding of the public key. -// Since we use a compressed encoding by default, this delegates to EncodeCompressed +// Encode returns a byte encoding of the public key (a G2 point). +// The current encoding is a compressed serialization of G2 following [zcash] https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format- +// +// The function should evolve in the future to support uncompressed compresion too. 
func (a *pubKeyBLSBLS12381) Encode() []byte { - return a.EncodeCompressed() + dest := make([]byte, g2BytesLen) + writePointE2(dest, &a.point) + return dest } // Equals checks is two public keys are equal diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 30f8b862aa0..d88bfa3aaa8 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -792,17 +792,23 @@ ERROR E2_read_bytes(E2* a, const byte *bin, const int len) { if (ret != VALID) { return ret; } + Fp2* a_x = &(a->x); + Fp_to_montg(&real(a_x), &real(a_x)); + Fp_to_montg(&imag(a_x), &imag(a_x)); // set a.z to 1 Fp2* a_z = &(a->z); Fp_copy(&real(a_z), &BLS12_381_pR); Fp_set_zero(&imag(a_z)); + Fp2* a_y = &(a->y); if (G2_SERIALIZATION == UNCOMPRESSED) { - ret = Fp2_read_bytes(&(a->y), bin + Fp2_BYTES, sizeof(a->y)); + ret = Fp2_read_bytes(a_y, bin + Fp2_BYTES, sizeof(a->y)); if (ret != VALID){ return ret; } + Fp_to_montg(&real(a_y), &real(a_y)); + Fp_to_montg(&imag(a_y), &imag(a_y)); // check read point is on curve if (!E2_affine_on_curve(a)) { return POINT_NOT_ON_CURVE; @@ -811,14 +817,9 @@ ERROR E2_read_bytes(E2* a, const byte *bin, const int len) { } // compute the possible square root - Fp2* a_x = &(a->x); - Fp_to_montg(&real(a_x), &real(a_x)); - Fp_to_montg(&imag(a_x), &imag(a_x)); - - Fp2* a_y = &(a->y); Fp2_squ_montg(a_y, a_x); - Fp2_mul_montg(a_y, a_y, a_x); - Fp2_add(a_y, a_y, &B_E2); // B_E2 is already in Montg form + Fp2_mul_montg(a_y, a_y, a_x); // x^3 + Fp2_add(a_y, a_y, &B_E2); // B_E2 is already in Montg form if (!Fp2_sqrt_montg(a_y, a_y)) // check whether x^3+b is a quadratic residue return POINT_NOT_ON_CURVE; @@ -976,33 +977,19 @@ void unsafe_map_bytes_to_G2(E2* p, const byte* bytes, int len) { G2_mult_gen(p, &log); } -// attempts to map `bytes` to a point in E2\G2 and stores it in p. -// `len` should be at least G2_SER_BYTES. It returns VALID only if mapping -// succeeds. -// For now, function only works when E2 serialization is compressed. +// maps `bytes` to a point in E2\G2 and stores it in p. +// `len` should be at least 192. // this is a testing tool only, it should not be used in any protocol! 
-ERROR unsafe_map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { - assert(G2_SERIALIZATION == COMPRESSED); - assert(len >= G2_SER_BYTES); - - // attempt to deserilize a compressed E2 point from input bytes - // after fixing the header 2 bits - byte copy[G2_SER_BYTES]; - memcpy(copy, bytes, sizeof(copy)); - copy[0] |= 1<<7; // set compression bit - copy[0] &= ~(1<<6); // clear infinity bit - point is not infinity - - ERROR ser = E2_read_bytes(p, copy, G2_SER_BYTES); - if (ser != VALID) { - return ser; - } - - // map the point to E2\G2 by clearing G2 order - E2_mult(p, p, (const Fr*)BLS12_381_r); - E2_to_affine(p, p); - - assert(E2_affine_on_curve(p)); // sanity check to make sure p is in E2 - return VALID; +void unsafe_map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { + assert(len >= 192); + Fp2 u; + map_96_bytes_to_Fp(&real(&u), bytes, 96); + map_96_bytes_to_Fp(&imag(&u), bytes+96, 96); + // map to E2's isogenous and then to E2 + map_to_isogenous_E2((POINTonE2 *)p, u); + isogeny_map_to_E2((POINTonE2 *)p, (POINTonE2 *)p); + // clear G2 order + E2_mult(p, p, (Fr*)&BLS12_381_r); } // ------------------- Pairing utilities diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index b9535d39955..87a515f3b31 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -295,9 +295,8 @@ func unsafeMapToG2(pt *pointE2, seed []byte) { // unsafeMapToG2Complement is a test function, it wraps a call to C since cgo can't be used in go test files. // It generates a random point in E2\G2 and stores it in input point. -func unsafeMapToG2Complement(pt *pointE2, seed []byte) bool { - res := C.unsafe_map_bytes_to_G2complement((*C.E2)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) - return int(res) == valid +func unsafeMapToG2Complement(pt *pointE2, seed []byte) { + C.unsafe_map_bytes_to_G2complement((*C.E2)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) } // This is only a TEST function. 
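Both of these serialization patches branch on the encoded point length via isG1Compressed / isG2Compressed (added in the previous patch). For reference, a sketch of the length arithmetic those helpers rely on, assuming Fp serializes to 48 bytes on BLS12-381; serializedLen is illustrative only and not part of the patch:

    // serializedLen mirrors G1_SER_BYTES / G2_SER_BYTES from bls12381_utils.h:
    // an affine point carries one (G1) or two (G2) Fp elements per coordinate,
    // and compression drops the y coordinate, halving the length.
    func serializedLen(fpPerCoordinate int, compressed bool) int {
        const fpBytes = 48                    // assumed Fp byte length for BLS12-381
        full := 2 * fpPerCoordinate * fpBytes // x and y coordinates
        if compressed {
            return full / 2 // y is recomputed from x when decoding
        }
        return full
    }

    // serializedLen(1, true)  == 48  : compressed G1 (signature length)
    // serializedLen(1, false) == 96  : uncompressed G1
    // serializedLen(2, true)  == 96  : compressed G2 (public key length)
    // serializedLen(2, false) == 192 : uncompressed G2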
diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index ccbb4c9655c..b9c8ab755a7 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -117,7 +117,7 @@ void E2_sum_vector(E2*, const E2*, const int); void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); bool E2_in_G2(const E2*); void unsafe_map_bytes_to_G2(E2*, const byte*, int); -ERROR unsafe_map_bytes_to_G2complement(E2*, const byte*, int); +void unsafe_map_bytes_to_G2complement(E2*, const byte*, int); // pairing and Fp12 bool Fp12_is_one(Fp12*); diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 7741238278e..067ac979f7e 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -22,6 +22,9 @@ func TestScalarMultBLS12381(t *testing.T) { // Note that generator and random point multiplications // are implemented with the same algorithm t.Run("G1", func(t *testing.T) { + if !isG1Compressed() { + t.Skip() + } var p pointE1 generatorScalarMultG1(&p, &expo) expected, err := hex.DecodeString("96484ca50719f5d2533047960878b6bae8289646c0f00a942a1e6992be9981a9e0c7a51e9918f9b19d178cf04a8018a4") @@ -35,6 +38,9 @@ func TestScalarMultBLS12381(t *testing.T) { // Note that generator and random point multiplications // are implemented with the same algorithm t.Run("G2", func(t *testing.T) { + if !isG2Compressed() { + t.Skip() + } var p pointE2 generatorScalarMultG2(&p, &expo) expected, err := hex.DecodeString("b35f5043f166848805b98da62dcb9c5d2f25e497bd0d9c461d4a00d19e4e67cc1e813de3c99479d5a2c62fb754fd7df40c4fd60c46834c8ae665343a3ff7dc3cc929de34ad62b7b55974f4e3fd20990d3e564b96e4d33de87716052d58cf823e") @@ -81,6 +87,9 @@ func BenchmarkScalarMult(b *testing.B) { // Sanity-check of the map-to-G1 with regards to the IETF draft hash-to-curve func TestMapToG1(t *testing.T) { + if !isG1Compressed() { + t.Skip() + } // test vectors from https://datatracker.ietf.org/doc/html/draft-irtf-cfrg-hash-to-curve-14#appendix-J.9.1 dst := []byte("QUUX-V01-CS02-with-BLS12381G1_XMD:SHA-256_SSWU_RO_") @@ -130,7 +139,7 @@ func BenchmarkMapToG1(b *testing.B) { // test subgroup membership check in G1 and G2 func TestSubgroupCheck(t *testing.T) { prg := getPRG(t) - seed := make([]byte, g2BytesLen) + seed := make([]byte, 192) _, err := prg.Read(seed) require.NoError(t, err) @@ -148,12 +157,7 @@ func TestSubgroupCheck(t *testing.T) { unsafeMapToG2(&p, seed) // point in G2 assert.True(t, checkMembershipG2(&p)) - inG2 := false - for !inG2 { - _, err := prg.Read(seed) - require.NoError(t, err) - inG2 = unsafeMapToG2Complement(&p, seed) // point in E2\G2 - } + unsafeMapToG2Complement(&p, seed) // point in E2\G2 assert.False(t, checkMembershipG2(&p)) }) } diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 7ea369a5b73..4047967be9b 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -29,7 +29,7 @@ func TestBLSMainMethods(t *testing.T) { // - signature decoding only accepts reduced x-coordinates to avoid signature malleability t.Run("invalid x coordinate larger than p", func(t *testing.T) { - if !isG1Compressed() { + if !isG1Compressed() || !isG2Compressed() { t.Skip() } msg, err := hex.DecodeString("7f26ba692dc2da7ff828ef4675ff1cd6ab855fca0637b6dab295f1df8e51bc8bb1b8f0c6610aabd486cf1f098f2ddbc6691d94e10f928816f890a3d366ce46249836a595c7ea1828af52e899ba2ab627ab667113bb563918c5d5a787c414399487b4e3a7") @@ -221,6 +221,9 @@ func TestBLSEncodeDecode(t *testing.T) { // may implicitely rely on the property. 
t.Run("public key with non-reduced coordinates", func(t *testing.T) { + if !isG2Compressed() { + t.Skip() + } // valid pk with x[0] < p and x[1] < p validPk, err := hex.DecodeString("818d72183e3e908af5bd6c2e37494c749b88f0396d3fbc2ba4d9ea28f1c50d1c6a540ec8fe06b6d860f72ec9363db3b8038360809700d36d761cb266af6babe9a069dc7364d3502e84536bd893d5f09ec2dd4f07cae1f8a178ffacc450f9b9a2") require.NoError(t, err) diff --git a/crypto/sign_test_utils.go b/crypto/sign_test_utils.go index 8362df83c7f..06179a01989 100644 --- a/crypto/sign_test_utils.go +++ b/crypto/sign_test_utils.go @@ -5,7 +5,6 @@ import ( "fmt" mrand "math/rand" "testing" - "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -14,7 +13,7 @@ import ( ) func getPRG(t *testing.T) *mrand.Rand { - random := time.Now().UnixNano() + random := int64(1685491239186156000) //time.Now().UnixNano() t.Logf("rng seed is %d", random) rng := mrand.New(mrand.NewSource(random)) return rng @@ -186,13 +185,13 @@ func testEncodeDecode(t *testing.T, salg SigningAlgorithm) { skCheckBytes := skCheck.Encode() assert.Equal(t, skBytes, skCheckBytes, "keys should be equal") distinctSkBytes := distinctSk.Encode() - assert.NotEqual(t, skBytes, distinctSkBytes, "keys should be different") + assert.NotEqual(t, skBytes, distinctSkBytes) // check public key encoding pk := sk.PublicKey() pkBytes := pk.Encode() pkCheck, err := DecodePublicKey(salg, pkBytes) - require.Nil(t, err, "the key decoding failed") + require.Nil(t, err) assert.True(t, pk.Equals(pkCheck), "key equality check failed") pkCheckBytes := pkCheck.Encode() assert.Equal(t, pkBytes, pkCheckBytes, "keys should be equal") @@ -200,14 +199,17 @@ func testEncodeDecode(t *testing.T, salg SigningAlgorithm) { assert.NotEqual(t, pkBytes, distinctPkBytes, "keys should be different") // same for the compressed encoding - pkComprBytes := pk.EncodeCompressed() - pkComprCheck, err := DecodePublicKeyCompressed(salg, pkComprBytes) - require.Nil(t, err, "the key decoding failed") - assert.True(t, pk.Equals(pkComprCheck), "key equality check failed") - pkCheckComprBytes := pkComprCheck.EncodeCompressed() - assert.Equal(t, pkComprBytes, pkCheckComprBytes, "keys should be equal") - distinctPkComprBytes := distinctSk.PublicKey().EncodeCompressed() - assert.NotEqual(t, pkComprBytes, distinctPkComprBytes, "keys should be different") + // skip is BLS is used and compression isn't supported + if !(salg == BLSBLS12381 && !isG2Compressed()) { + pkComprBytes := pk.EncodeCompressed() + pkComprCheck, err := DecodePublicKeyCompressed(salg, pkComprBytes) + require.Nil(t, err, "the key decoding failed") + assert.True(t, pk.Equals(pkComprCheck), "key equality check failed") + pkCheckComprBytes := pkComprCheck.EncodeCompressed() + assert.Equal(t, pkComprBytes, pkCheckComprBytes, "keys should be equal") + distinctPkComprBytes := distinctSk.PublicKey().EncodeCompressed() + assert.NotEqual(t, pkComprBytes, distinctPkComprBytes, "keys should be different") + } } }) From 01b64c560343ef605c7d4737c802c2b7ac2b2ac7 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 30 May 2023 21:34:18 -0600 Subject: [PATCH 126/200] make sure older compilers recognize uintx_t --- crypto/bls12381_utils.h | 1 + 1 file changed, 1 insertion(+) diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index b9c8ab755a7..d35e0298c59 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -6,6 +6,7 @@ #define _BLS12_381_UTILS_H #include +#include #include "blst_include.h" typedef uint8_t byte; From 
7b0a25ec8190b67202b9a4180a13363350e31937 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 30 May 2023 21:40:31 -0600 Subject: [PATCH 127/200] update crypto/Makefile go command --- crypto/Makefile | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/crypto/Makefile b/crypto/Makefile index a75e00df15b..3fa010ca6ae 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -10,20 +10,24 @@ else RACE_FLAG := endif +# the crypto package uses BLST source files underneath which may use ADX insructions ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) +ifeq ($(ADX_SUPPORT), 1) +# if ADX insructions are supported, default is to use a fast ADX BLST implementation + CGO_FLAG := +else +# if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation + CGO_FLAG := CGO_CFLAGS="-O -D__BLST_PORTABLE__" +endif # test all packages .PHONY: test test: -# root package (it uses BLST source files underneath which requires testing for ADX support) -ifeq ($(ADX_SUPPORT), 1) - go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) -else - CGO_CFLAGS="-O -D__BLST_PORTABLE__" go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) -endif +# root package + $(CGO_FLAG) go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) # sub packages - go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) ./hash - go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) ./random + $(CGO_FLAG) go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) ./hash + $(CGO_FLAG) go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) ./random .PHONY: docker-build docker-build: From 00f66d6c2f17ab095358f1420c9b8c5b089902c9 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 30 May 2023 21:43:08 -0600 Subject: [PATCH 128/200] package default build uses ADX --- crypto/bls12381_utils.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 87a515f3b31..a01c46e05b8 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -4,7 +4,7 @@ package crypto // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -O -D__BLST_PORTABLE__ -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros +// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ // #include "bls12381_utils.h" From e998ab6eebc5630035c476d1d202f291da864ec6 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 31 May 2023 17:08:47 
-0600 Subject: [PATCH 129/200] add ADX detection and cgo flags for all go commands in Makefile/Dockerfile --- Makefile | 82 +++++++++++++-------- cmd/Dockerfile | 6 +- crypto/Makefile | 12 ++- insecure/Makefile | 20 ++++- integration/Makefile | 42 ++++++++--- integration/benchmark/cmd/manual/Dockerfile | 5 +- 6 files changed, 117 insertions(+), 50 deletions(-) diff --git a/Makefile b/Makefile index cd402f40f1e..8fcb8fa3ecb 100644 --- a/Makefile +++ b/Makefile @@ -42,11 +42,29 @@ K8S_YAMLS_LOCATION_STAGING=./k8s/staging export CONTAINER_REGISTRY := gcr.io/flow-container-registry export DOCKER_BUILDKIT := 1 +# `ADX_SUPPORT` is 1 if ADX instructions are supported and 0 otherwise. +ifeq ($(shell uname -s),Linux) +# detect ADX support on the CURRENT linux machine. + ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) +else +# on non-linux machines, set the flag to 1 by default + ADX_SUPPORT := 1 +endif + +# the crypto package uses BLST source files underneath which may use ADX insructions. +ifeq ($(ADX_SUPPORT), 1) +# if ADX insructions are supported, default is to use a fast ADX BLST implementation + CGO_FLAG := +else +# if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation + CGO_FLAG := CGO_CFLAGS="-O -D__BLST_PORTABLE__" +endif + cmd/collection/collection: - go build -o cmd/collection/collection cmd/collection/main.go + $(CGO_FLAG) go build -o cmd/collection/collection cmd/collection/main.go cmd/util/util: - go build -o cmd/util/util cmd/util/main.go + $(CGO_FLAG) go build -o cmd/util/util cmd/util/main.go .PHONY: update-core-contracts-version update-core-contracts-version: @@ -58,7 +76,7 @@ update-core-contracts-version: .PHONY: unittest-main unittest-main: # test all packages - go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) -covermode=atomic $(if $(RACE_DETECTOR),-race,) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(GO_TEST_PACKAGES) + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) -covermode=atomic $(if $(RACE_DETECTOR),-race,) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(GO_TEST_PACKAGES) .PHONY: install-mock-generators install-mock-generators: @@ -83,7 +101,7 @@ verify-mocks: generate-mocks .PHONY: fuzz-fvm fuzz-fvm: # run fuzz tests in the fvm package - cd ./fvm && go test -fuzz=Fuzz -run ^$$ + cd ./fvm && $(CGO_FLAG) go test -fuzz=Fuzz -run ^$$ .PHONY: test test: verify-mocks unittest-main @@ -121,7 +139,7 @@ generate-proto: .PHONY: generate-fvm-env-wrappers generate-fvm-env-wrappers: - go run ./fvm/environment/generate-wrappers fvm/environment/parse_restricted_checker.go + $(CGO_FLAG) go run ./fvm/environment/generate-wrappers fvm/environment/parse_restricted_checker.go .PHONY: generate-mocks generate-mocks: install-mock-generators @@ -241,59 +259,59 @@ docker-ci-integration: .PHONY: docker-build-collection docker-build-collection: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t 
"$(CONTAINER_REGISTRY)/collection:latest" -t "$(CONTAINER_REGISTRY)/collection:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/collection:$(IMAGE_TAG)" -t "$(CONTAINER_REGISTRY)/collection:$(FLOW_GO_TAG)" . .PHONY: docker-build-collection-without-netgo docker-build-collection-without-netgo: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/collection:$(IMAGE_TAG_NO_NETGO)" . .PHONY: docker-build-collection-debug docker-build-collection-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/collection-debug:latest" -t "$(CONTAINER_REGISTRY)/collection-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/collection-debug:$(IMAGE_TAG)" . .PHONY: docker-build-consensus docker-build-consensus: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/consensus:latest" -t "$(CONTAINER_REGISTRY)/consensus:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/consensus:$(IMAGE_TAG)" -t "$(CONTAINER_REGISTRY)/consensus:$(FLOW_GO_TAG)" . .PHONY: docker-build-consensus-without-netgo docker-build-consensus-without-netgo: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/consensus:$(IMAGE_TAG_NO_NETGO)" . 
.PHONY: docker-build-consensus-debug docker-build-consensus-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/consensus-debug:latest" -t "$(CONTAINER_REGISTRY)/consensus-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/consensus-debug:$(IMAGE_TAG)" . .PHONY: docker-build-execution docker-build-execution: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/execution:latest" -t "$(CONTAINER_REGISTRY)/execution:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/execution:$(IMAGE_TAG)" -t "$(CONTAINER_REGISTRY)/execution:$(FLOW_GO_TAG)" . .PHONY: docker-build-execution-without-netgo docker-build-execution-without-netgo: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/execution:$(IMAGE_TAG_NO_NETGO)" . .PHONY: docker-build-execution-debug docker-build-execution-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/execution-debug:latest" -t "$(CONTAINER_REGISTRY)/execution-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/execution-debug:$(IMAGE_TAG)" . 
# build corrupt execution node for BFT testing @@ -301,28 +319,28 @@ docker-build-execution-debug: docker-build-execution-corrupt: # temporarily make insecure/ a non-module to allow Docker to use corrupt builders there ./insecure/cmd/mods_override.sh - docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/execution-corrupted:latest" -t "$(CONTAINER_REGISTRY)/execution-corrupted:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/execution-corrupted:$(IMAGE_TAG)" . ./insecure/cmd/mods_restore.sh .PHONY: docker-build-verification docker-build-verification: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/verification:latest" -t "$(CONTAINER_REGISTRY)/verification:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/verification:$(IMAGE_TAG)" -t "$(CONTAINER_REGISTRY)/verification:$(FLOW_GO_TAG)" . .PHONY: docker-build-verification-without-netgo docker-build-verification-without-netgo: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/verification:$(IMAGE_TAG_NO_NETGO)" . .PHONY: docker-build-verification-debug docker-build-verification-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/verification-debug:latest" -t "$(CONTAINER_REGISTRY)/verification-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/verification-debug:$(IMAGE_TAG)" . 
# build corrupt verification node for BFT testing @@ -330,28 +348,28 @@ docker-build-verification-debug: docker-build-verification-corrupt: # temporarily make insecure/ a non-module to allow Docker to use corrupt builders there ./insecure/cmd/mods_override.sh - docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/verification-corrupted:latest" -t "$(CONTAINER_REGISTRY)/verification-corrupted:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/verification-corrupted:$(IMAGE_TAG)" . ./insecure/cmd/mods_restore.sh .PHONY: docker-build-access docker-build-access: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/access:latest" -t "$(CONTAINER_REGISTRY)/access:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/access:$(IMAGE_TAG)" -t "$(CONTAINER_REGISTRY)/access:$(FLOW_GO_TAG)" . .PHONY: docker-build-access-without-netgo docker-build-access-without-netgo: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/access:$(IMAGE_TAG_NO_NETGO)" . .PHONY: docker-build-access-debug docker-build-access-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/access-debug:latest" -t "$(CONTAINER_REGISTRY)/access-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/access-debug:$(IMAGE_TAG)" . 
# build corrupt access node for BFT testing @@ -359,21 +377,21 @@ docker-build-access-debug: docker-build-access-corrupt: #temporarily make insecure/ a non-module to allow Docker to use corrupt builders there ./insecure/cmd/mods_override.sh - docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/access-corrupted:latest" -t "$(CONTAINER_REGISTRY)/access-corrupted:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/access-corrupted:$(IMAGE_TAG)" . ./insecure/cmd/mods_restore.sh .PHONY: docker-build-observer docker-build-observer: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/observer --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/observer --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/observer:latest" -t "$(CONTAINER_REGISTRY)/observer:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/observer:$(IMAGE_TAG)" . .PHONY: docker-build-observer-without-netgo docker-build-observer-without-netgo: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/observer --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/observer --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/observer:$(IMAGE_TAG_NO_NETGO)" . @@ -381,18 +399,18 @@ docker-build-observer-without-netgo: .PHONY: docker-build-ghost docker-build-ghost: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/ghost --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/ghost --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/ghost:latest" -t "$(CONTAINER_REGISTRY)/ghost:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/ghost:$(IMAGE_TAG)" . 
.PHONY: docker-build-ghost-debug docker-build-ghost-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/ghost --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/ghost --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/ghost-debug:latest" -t "$(CONTAINER_REGISTRY)/ghost-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/ghost-debug:$(IMAGE_TAG)" . PHONY: docker-build-bootstrap docker-build-bootstrap: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/bootstrap --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/bootstrap --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/bootstrap:latest" -t "$(CONTAINER_REGISTRY)/bootstrap:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/bootstrap:$(IMAGE_TAG)" . @@ -402,7 +420,7 @@ tool-bootstrap: docker-build-bootstrap .PHONY: docker-build-bootstrap-transit docker-build-bootstrap-transit: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/bootstrap/transit --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(VERSION) --build-arg GOARCH=$(GOARCH) --no-cache \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/bootstrap/transit --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(VERSION) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --no-cache \ --target production \ -t "$(CONTAINER_REGISTRY)/bootstrap-transit:latest" -t "$(CONTAINER_REGISTRY)/bootstrap-transit:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/bootstrap-transit:$(IMAGE_TAG)" . @@ -412,7 +430,7 @@ tool-transit: docker-build-bootstrap-transit .PHONY: docker-build-loader docker-build-loader: - docker build -f ./integration/benchmark/cmd/manual/Dockerfile --build-arg TARGET=./benchmark/cmd/manual --target production \ + docker build -f ./integration/benchmark/cmd/manual/Dockerfile --build-arg TARGET=./benchmark/cmd/manual --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/loader:latest" -t "$(CONTAINER_REGISTRY)/loader:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/loader:$(IMAGE_TAG)" . @@ -597,7 +615,7 @@ docker-all-tools: tool-util tool-remove-execution-fork PHONY: docker-build-util docker-build-util: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/util --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/util --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ -t "$(CONTAINER_REGISTRY)/util:latest" -t "$(CONTAINER_REGISTRY)/util:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/util:$(IMAGE_TAG)" . 
PHONY: tool-util @@ -606,7 +624,7 @@ tool-util: docker-build-util PHONY: docker-build-remove-execution-fork docker-build-remove-execution-fork: - docker build -f cmd/Dockerfile --ssh default --build-arg TARGET=./cmd/util/cmd/remove-execution-fork --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --ssh default --build-arg TARGET=./cmd/util/cmd/remove-execution-fork --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ -t "$(CONTAINER_REGISTRY)/remove-execution-fork:latest" -t "$(CONTAINER_REGISTRY)/remove-execution-fork:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/remove-execution-fork:$(IMAGE_TAG)" . PHONY: tool-remove-execution-fork diff --git a/cmd/Dockerfile b/cmd/Dockerfile index 90075485922..d3660bd2b27 100644 --- a/cmd/Dockerfile +++ b/cmd/Dockerfile @@ -36,13 +36,15 @@ ARG GOARCH=amd64 # TAGS can be overriden to modify the go build tags (e.g. build without netgo) ARG TAGS="netgo" +# CGO_FLAG can be overwritten +ARG CGO_FLAG # Keep Go's build cache between builds. # https://github.com/golang/go/issues/27719#issuecomment-514747274 RUN --mount=type=cache,sharing=locked,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ --mount=type=secret,id=git_creds,dst=/root/.netrc \ - CGO_ENABLED=1 GOOS=linux go build --tags "${TAGS}" -ldflags "-extldflags -static \ + CGO_ENABLED=1 GOOS=linux ${CGO_FLAG} go build --tags "${TAGS}" -ldflags "-extldflags -static \ -X 'github.com/onflow/flow-go/cmd/build.commit=${COMMIT}' -X 'github.com/onflow/flow-go/cmd/build.semver=${VERSION}'" \ -o ./app ${TARGET} @@ -63,7 +65,7 @@ ARG GOARCH=amd64 RUN --mount=type=ssh \ --mount=type=cache,sharing=locked,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ - CGO_ENABLED=1 GOOS=linux go build --tags "netgo" -ldflags "-extldflags -static \ + CGO_ENABLED=1 GOOS=linux ${CGO_FLAG} go build --tags "netgo" -ldflags "-extldflags -static \ -X 'github.com/onflow/flow-go/cmd/build.commit=${COMMIT}' -X 'github.com/onflow/flow-go/cmd/build.semver=${VERSION}'" \ -gcflags="all=-N -l" -o ./app ${TARGET} diff --git a/crypto/Makefile b/crypto/Makefile index 3fa010ca6ae..c7361bde76b 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -10,8 +10,16 @@ else RACE_FLAG := endif -# the crypto package uses BLST source files underneath which may use ADX insructions -ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) +# `ADX_SUPPORT` is 1 if ADX instructions are supported and 0 otherwise. +ifeq ($(shell uname -s),Linux) +# detect ADX support on the CURRENT linux machine. + ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) +else +# on non-linux machines, set the flag to 1 by default + ADX_SUPPORT := 1 +endif + +# the crypto package uses BLST source files underneath which may use ADX insructions. ifeq ($(ADX_SUPPORT), 1) # if ADX insructions are supported, default is to use a fast ADX BLST implementation CGO_FLAG := diff --git a/insecure/Makefile b/insecure/Makefile index 9872f01b1d8..635d9a06ad7 100644 --- a/insecure/Makefile +++ b/insecure/Makefile @@ -8,7 +8,25 @@ else RACE_FLAG := endif +# `ADX_SUPPORT` is 1 if ADX instructions are supported and 0 otherwise. +ifeq ($(shell uname -s),Linux) +# detect ADX support on the CURRENT linux machine. 
+ ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) +else +# on non-linux machines, set the flag to 1 by default + ADX_SUPPORT := 1 +endif + +# the crypto package uses BLST source files underneath which may use ADX insructions. +ifeq ($(ADX_SUPPORT), 1) +# if ADX insructions are supported, default is to use a fast ADX BLST implementation + CGO_FLAG := +else +# if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation + CGO_FLAG := CGO_CFLAGS="-O -D__BLST_PORTABLE__" +endif + # runs all unit tests of the insecure module .PHONY: test test: - go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./... diff --git a/integration/Makefile b/integration/Makefile index 7751b4ee333..b01c10d1954 100644 --- a/integration/Makefile +++ b/integration/Makefile @@ -8,6 +8,24 @@ else RACE_FLAG := endif +# `ADX_SUPPORT` is 1 if ADX instructions are supported and 0 otherwise. +ifeq ($(shell uname -s),Linux) +# detect ADX support on the CURRENT linux machine. + ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) +else +# on non-linux machines, set the flag to 1 by default + ADX_SUPPORT := 1 +endif + +# the crypto package uses BLST source files underneath which may use ADX insructions. +ifeq ($(ADX_SUPPORT), 1) +# if ADX insructions are supported, default is to use a fast ADX BLST implementation + CGO_FLAG := +else +# if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation + CGO_FLAG := CGO_CFLAGS="-O -D__BLST_PORTABLE__" +endif + # Run the integration test suite .PHONY: integration-test integration-test: access-tests ghost-tests mvp-tests execution-tests verification-tests upgrades-tests collection-tests epochs-tests network-tests consensus-tests @@ -22,53 +40,53 @@ ci-integration-test: access-tests ghost-tests mvp-tests epochs-tests consensus-t # Run unit tests for test utilities in this module .PHONY: test test: - go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) `go list ./... | grep -v -e integration/tests` + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) `go list ./... | grep -v -e integration/tests` .PHONY: access-tests access-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/access/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/access/... .PHONY: collection-tests collection-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/collection/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/collection/... .PHONY: consensus-tests consensus-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/consensus/... 
+ $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/consensus/... .PHONY: epochs-tests epochs-tests: # Use a higher timeout of 20m for the suite of tests which span full epochs - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -timeout 30m ./tests/epochs/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -timeout 30m ./tests/epochs/... .PHONY: ghost-tests ghost-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/ghost/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/ghost/... .PHONY: mvp-tests mvp-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/mvp/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/mvp/... .PHONY: execution-tests execution-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/execution/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/execution/... .PHONY: verification-tests verification-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/verification/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/verification/... .PHONY: upgrades-tests upgrades-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/upgrades/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/upgrades/... .PHONY: network-tests network-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/network/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/network/... # BFT tests need to be run sequentially (-p 1) due to interference between different Docker networks when tests are run in parallel .PHONY: bft-tests bft-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/bft/... -p 1 + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/bft/... -p 1 ############################################################################################ diff --git a/integration/benchmark/cmd/manual/Dockerfile b/integration/benchmark/cmd/manual/Dockerfile index 8d474efd3dc..8ae85e43326 100644 --- a/integration/benchmark/cmd/manual/Dockerfile +++ b/integration/benchmark/cmd/manual/Dockerfile @@ -29,6 +29,9 @@ COPY . . FROM build-env as build-production WORKDIR /app +# CGO_FLAG can be overwritten +ARG CGO_FLAG + # Keep Go's build cache between builds. 
# https://github.com/golang/go/issues/27719#issuecomment-514747274 # Also, allow ssh access @@ -36,7 +39,7 @@ RUN --mount=type=cache,sharing=locked,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ --mount=type=ssh \ cd integration && \ - CGO_ENABLED=1 go build -ldflags "-extldflags -static" -o ./app ./${TARGET} + CGO_ENABLED=1 ${CGO_FLAG} go build -ldflags "-extldflags -static" -o ./app ./${TARGET} RUN mv /app/integration/app /app/app From 5dba1c5bfd924a8188864e1fba1101a7645169c1 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 31 May 2023 19:25:00 -0600 Subject: [PATCH 130/200] clarify BLST sigill message --- crypto/bls12381_utils.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index a01c46e05b8..72e4c010e11 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -14,7 +14,7 @@ package crypto // # include // # include // static void handler(int signum) -// { char text[1024] = "Caught SIGILL in blst_cgo_init, BLST library (used by flow-go/crypto) requires ADX support, build with CGO_CFLAGS=-O -D__BLST_PORTABLE__"; +// { char text[1024] = "Caught SIGILL in blst_cgo_init, BLST library (used by flow-go/crypto) requires ADX support, build with "CGO_CFLAGS=-O -D__BLST_PORTABLE__"\n"; // ssize_t n = write(2, &text, strlen(text)); // _exit(128+SIGILL); // (void)n; From f471e4b9aabe3d9f0d8597b8e3818046481012e0 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 31 May 2023 20:21:57 -0600 Subject: [PATCH 131/200] fix a bug in sigill string --- crypto/bls12381_utils.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 72e4c010e11..fa2a6ff65de 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -14,7 +14,7 @@ package crypto // # include // # include // static void handler(int signum) -// { char text[1024] = "Caught SIGILL in blst_cgo_init, BLST library (used by flow-go/crypto) requires ADX support, build with "CGO_CFLAGS=-O -D__BLST_PORTABLE__"\n"; +// { char text[1024] = "Caught SIGILL in blst_cgo_init, BLST library (used by flow-go/crypto) requires ADX support, build with \"CGO_CFLAGS=-O -D__BLST_PORTABLE__\"\n"; // ssize_t n = write(2, &text, strlen(text)); // _exit(128+SIGILL); // (void)n; From d4b873db9e74bf66754bd3a147eb3d5b2b4b1a60 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 31 May 2023 20:23:09 -0600 Subject: [PATCH 132/200] update how cgo flag is passed to Dockerfile so that dittos aren't deleted --- Makefile | 59 +++++++++++---------- cmd/Dockerfile | 4 +- crypto/Makefile | 5 +- insecure/Makefile | 5 +- integration/Makefile | 5 +- integration/benchmark/cmd/manual/Dockerfile | 2 +- 6 files changed, 42 insertions(+), 38 deletions(-) diff --git a/Makefile b/Makefile index 8fcb8fa3ecb..93fa3be60ba 100644 --- a/Makefile +++ b/Makefile @@ -54,11 +54,12 @@ endif # the crypto package uses BLST source files underneath which may use ADX insructions. 
ifeq ($(ADX_SUPPORT), 1) # if ADX insructions are supported, default is to use a fast ADX BLST implementation - CGO_FLAG := + CRYPTO_FLAG := "" else # if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation - CGO_FLAG := CGO_CFLAGS="-O -D__BLST_PORTABLE__" + CRYPTO_FLAG := "-O -D__BLST_PORTABLE__" endif +CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) cmd/collection/collection: $(CGO_FLAG) go build -o cmd/collection/collection cmd/collection/main.go @@ -259,59 +260,59 @@ docker-ci-integration: .PHONY: docker-build-collection docker-build-collection: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/collection:latest" -t "$(CONTAINER_REGISTRY)/collection:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/collection:$(IMAGE_TAG)" -t "$(CONTAINER_REGISTRY)/collection:$(FLOW_GO_TAG)" . .PHONY: docker-build-collection-without-netgo docker-build-collection-without-netgo: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/collection:$(IMAGE_TAG_NO_NETGO)" . .PHONY: docker-build-collection-debug docker-build-collection-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/collection-debug:latest" -t "$(CONTAINER_REGISTRY)/collection-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/collection-debug:$(IMAGE_TAG)" . 
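For context on what the flag selection above amounts to: the Makefiles trust /proc/cpuinfo only on Linux and assume ADX support everywhere else, then choose between empty CGO flags (fast ADX BLST path) and "-O -D__BLST_PORTABLE__" (slower portable path). A minimal Go sketch of an equivalent runtime check follows; it assumes the golang.org/x/sys/cpu package, which is not used by this patch and is shown purely for illustration.

package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

func main() {
	// An empty flag set keeps the fast ADX-based BLST implementation;
	// the portable define falls back to the generic implementation.
	cryptoFlags := ""
	if !cpu.X86.HasADX {
		cryptoFlags = "-O -D__BLST_PORTABLE__"
	}
	fmt.Printf("suggested CGO_CFLAGS: %q\n", cryptoFlags)
}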
.PHONY: docker-build-consensus docker-build-consensus: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/consensus:latest" -t "$(CONTAINER_REGISTRY)/consensus:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/consensus:$(IMAGE_TAG)" -t "$(CONTAINER_REGISTRY)/consensus:$(FLOW_GO_TAG)" . .PHONY: docker-build-consensus-without-netgo docker-build-consensus-without-netgo: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/consensus:$(IMAGE_TAG_NO_NETGO)" . .PHONY: docker-build-consensus-debug docker-build-consensus-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/consensus-debug:latest" -t "$(CONTAINER_REGISTRY)/consensus-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/consensus-debug:$(IMAGE_TAG)" . .PHONY: docker-build-execution docker-build-execution: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/execution:latest" -t "$(CONTAINER_REGISTRY)/execution:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/execution:$(IMAGE_TAG)" -t "$(CONTAINER_REGISTRY)/execution:$(FLOW_GO_TAG)" . 
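The COMMIT and VERSION build-args in the recipes above are not read by the binaries directly; the Dockerfile burns them in through -ldflags "-X 'github.com/onflow/flow-go/cmd/build.commit=${COMMIT}' ...". The -X linker flag can only overwrite package-level string variables, roughly of the following shape. This is a hedged sketch: the variable names match the -X targets shown in the Dockerfile, but the accessor functions and the rest of flow-go's cmd/build package may differ.

package build

// These values are overwritten at link time, e.g.:
//   go build -ldflags "-X 'github.com/onflow/flow-go/cmd/build.commit=<sha>'"
// A plain `go build` without the flag leaves them empty.
var (
	commit string
	semver string
)

// Commit returns the git commit injected at build time (empty for local builds).
func Commit() string { return commit }

// Semver returns the version string injected at build time.
func Semver() string { return semver }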
.PHONY: docker-build-execution-without-netgo docker-build-execution-without-netgo: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/execution:$(IMAGE_TAG_NO_NETGO)" . .PHONY: docker-build-execution-debug docker-build-execution-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/execution-debug:latest" -t "$(CONTAINER_REGISTRY)/execution-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/execution-debug:$(IMAGE_TAG)" . # build corrupt execution node for BFT testing @@ -319,28 +320,28 @@ docker-build-execution-debug: docker-build-execution-corrupt: # temporarily make insecure/ a non-module to allow Docker to use corrupt builders there ./insecure/cmd/mods_override.sh - docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/execution-corrupted:latest" -t "$(CONTAINER_REGISTRY)/execution-corrupted:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/execution-corrupted:$(IMAGE_TAG)" . ./insecure/cmd/mods_restore.sh .PHONY: docker-build-verification docker-build-verification: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/verification:latest" -t "$(CONTAINER_REGISTRY)/verification:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/verification:$(IMAGE_TAG)" -t "$(CONTAINER_REGISTRY)/verification:$(FLOW_GO_TAG)" . 
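On the local-build side, the $(CGO_FLAG) prefix that the crypto, insecure, and integration Makefiles now put in front of go test is simply an environment-variable assignment scoped to that one command: it sets CGO_CFLAGS for the cgo compilation of the BLST sources. A hedged Go sketch of the same mechanism, driving go test as a child process (the flag value mirrors the portable case; the wrapper itself is illustrative and not part of the patch):

package main

import (
	"os"
	"os/exec"
)

// runPortableTests runs `go test ./...` with the portable (non-ADX) BLST
// flags, mirroring `CGO_CFLAGS="-O -D__BLST_PORTABLE__" go test ./...`.
func runPortableTests() error {
	cmd := exec.Command("go", "test", "./...")
	cmd.Env = append(os.Environ(), "CGO_CFLAGS=-O -D__BLST_PORTABLE__")
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	return cmd.Run()
}

func main() {
	if err := runPortableTests(); err != nil {
		os.Exit(1)
	}
}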
.PHONY: docker-build-verification-without-netgo docker-build-verification-without-netgo: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/verification:$(IMAGE_TAG_NO_NETGO)" . .PHONY: docker-build-verification-debug docker-build-verification-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/verification-debug:latest" -t "$(CONTAINER_REGISTRY)/verification-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/verification-debug:$(IMAGE_TAG)" . # build corrupt verification node for BFT testing @@ -348,28 +349,28 @@ docker-build-verification-debug: docker-build-verification-corrupt: # temporarily make insecure/ a non-module to allow Docker to use corrupt builders there ./insecure/cmd/mods_override.sh - docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/verification-corrupted:latest" -t "$(CONTAINER_REGISTRY)/verification-corrupted:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/verification-corrupted:$(IMAGE_TAG)" . ./insecure/cmd/mods_restore.sh .PHONY: docker-build-access docker-build-access: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/access:latest" -t "$(CONTAINER_REGISTRY)/access:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/access:$(IMAGE_TAG)" -t "$(CONTAINER_REGISTRY)/access:$(FLOW_GO_TAG)" . 
.PHONY: docker-build-access-without-netgo docker-build-access-without-netgo: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/access:$(IMAGE_TAG_NO_NETGO)" . .PHONY: docker-build-access-debug docker-build-access-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/access-debug:latest" -t "$(CONTAINER_REGISTRY)/access-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/access-debug:$(IMAGE_TAG)" . # build corrupt access node for BFT testing @@ -377,21 +378,21 @@ docker-build-access-debug: docker-build-access-corrupt: #temporarily make insecure/ a non-module to allow Docker to use corrupt builders there ./insecure/cmd/mods_override.sh - docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/access-corrupted:latest" -t "$(CONTAINER_REGISTRY)/access-corrupted:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/access-corrupted:$(IMAGE_TAG)" . ./insecure/cmd/mods_restore.sh .PHONY: docker-build-observer docker-build-observer: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/observer --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/observer --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/observer:latest" -t "$(CONTAINER_REGISTRY)/observer:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/observer:$(IMAGE_TAG)" . 
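The *-without-netgo targets above tag images with IMAGE_TAG_NO_NETGO and correspond to builds where the Dockerfile's TAGS build-arg drops the default "netgo" tag (the TAGS override itself is outside these hunks). The netgo tag is interpreted by the Go standard library, selecting the pure-Go name resolver, rather than by project code, but the general mechanism is the usual //go:build constraint. The file and identifiers below are hypothetical and only illustrate how a tag-guarded file is written.

//go:build netgo

package buildinfo

// ResolverHint is compiled in only when the netgo tag is set; a sibling
// file guarded with `//go:build !netgo` could supply the other value.
const ResolverHint = "pure-Go resolver (netgo)"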
.PHONY: docker-build-observer-without-netgo docker-build-observer-without-netgo: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/observer --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/observer --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/observer:$(IMAGE_TAG_NO_NETGO)" . @@ -399,18 +400,18 @@ docker-build-observer-without-netgo: .PHONY: docker-build-ghost docker-build-ghost: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/ghost --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/ghost --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/ghost:latest" -t "$(CONTAINER_REGISTRY)/ghost:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/ghost:$(IMAGE_TAG)" . .PHONY: docker-build-ghost-debug docker-build-ghost-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/ghost --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/ghost --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/ghost-debug:latest" -t "$(CONTAINER_REGISTRY)/ghost-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/ghost-debug:$(IMAGE_TAG)" . PHONY: docker-build-bootstrap docker-build-bootstrap: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/bootstrap --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/bootstrap --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/bootstrap:latest" -t "$(CONTAINER_REGISTRY)/bootstrap:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/bootstrap:$(IMAGE_TAG)" . @@ -420,7 +421,7 @@ tool-bootstrap: docker-build-bootstrap .PHONY: docker-build-bootstrap-transit docker-build-bootstrap-transit: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/bootstrap/transit --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(VERSION) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --no-cache \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/bootstrap/transit --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(VERSION) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --no-cache \ --target production \ -t "$(CONTAINER_REGISTRY)/bootstrap-transit:latest" -t "$(CONTAINER_REGISTRY)/bootstrap-transit:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/bootstrap-transit:$(IMAGE_TAG)" . 
@@ -430,7 +431,7 @@ tool-transit: docker-build-bootstrap-transit .PHONY: docker-build-loader docker-build-loader: - docker build -f ./integration/benchmark/cmd/manual/Dockerfile --build-arg TARGET=./benchmark/cmd/manual --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f ./integration/benchmark/cmd/manual/Dockerfile --build-arg TARGET=./benchmark/cmd/manual --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/loader:latest" -t "$(CONTAINER_REGISTRY)/loader:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/loader:$(IMAGE_TAG)" . @@ -615,7 +616,7 @@ docker-all-tools: tool-util tool-remove-execution-fork PHONY: docker-build-util docker-build-util: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/util --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/util --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ -t "$(CONTAINER_REGISTRY)/util:latest" -t "$(CONTAINER_REGISTRY)/util:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/util:$(IMAGE_TAG)" . PHONY: tool-util @@ -624,7 +625,7 @@ tool-util: docker-build-util PHONY: docker-build-remove-execution-fork docker-build-remove-execution-fork: - docker build -f cmd/Dockerfile --ssh default --build-arg TARGET=./cmd/util/cmd/remove-execution-fork --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --ssh default --build-arg TARGET=./cmd/util/cmd/remove-execution-fork --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ -t "$(CONTAINER_REGISTRY)/remove-execution-fork:latest" -t "$(CONTAINER_REGISTRY)/remove-execution-fork:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/remove-execution-fork:$(IMAGE_TAG)" . PHONY: tool-remove-execution-fork diff --git a/cmd/Dockerfile b/cmd/Dockerfile index d3660bd2b27..ade91976f7e 100644 --- a/cmd/Dockerfile +++ b/cmd/Dockerfile @@ -44,7 +44,7 @@ ARG CGO_FLAG RUN --mount=type=cache,sharing=locked,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ --mount=type=secret,id=git_creds,dst=/root/.netrc \ - CGO_ENABLED=1 GOOS=linux ${CGO_FLAG} go build --tags "${TAGS}" -ldflags "-extldflags -static \ + CGO_ENABLED=1 GOOS=linux CGO_FLAGS="${CGO_FLAG}" go build --tags "${TAGS}" -ldflags "-extldflags -static \ -X 'github.com/onflow/flow-go/cmd/build.commit=${COMMIT}' -X 'github.com/onflow/flow-go/cmd/build.semver=${VERSION}'" \ -o ./app ${TARGET} @@ -65,7 +65,7 @@ ARG GOARCH=amd64 RUN --mount=type=ssh \ --mount=type=cache,sharing=locked,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ - CGO_ENABLED=1 GOOS=linux ${CGO_FLAG} go build --tags "netgo" -ldflags "-extldflags -static \ + CGO_ENABLED=1 GOOS=linux CGO_FLAGS="${CGO_FLAG}" go build --tags "netgo" -ldflags "-extldflags -static \ -X 'github.com/onflow/flow-go/cmd/build.commit=${COMMIT}' -X 'github.com/onflow/flow-go/cmd/build.semver=${VERSION}'" \ -gcflags="all=-N -l" -o ./app ${TARGET} diff --git a/crypto/Makefile b/crypto/Makefile index c7361bde76b..04cc9ae19d8 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -22,11 +22,12 @@ endif # the crypto package uses BLST source files underneath which may use ADX insructions. 
ifeq ($(ADX_SUPPORT), 1) # if ADX insructions are supported, default is to use a fast ADX BLST implementation - CGO_FLAG := + CRYPTO_FLAG := "" else # if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation - CGO_FLAG := CGO_CFLAGS="-O -D__BLST_PORTABLE__" + CRYPTO_FLAG := "-O -D__BLST_PORTABLE__" endif +CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) # test all packages .PHONY: test diff --git a/insecure/Makefile b/insecure/Makefile index 635d9a06ad7..fd6fdae0dd9 100644 --- a/insecure/Makefile +++ b/insecure/Makefile @@ -20,11 +20,12 @@ endif # the crypto package uses BLST source files underneath which may use ADX insructions. ifeq ($(ADX_SUPPORT), 1) # if ADX insructions are supported, default is to use a fast ADX BLST implementation - CGO_FLAG := + CRYPTO_FLAG := "" else # if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation - CGO_FLAG := CGO_CFLAGS="-O -D__BLST_PORTABLE__" + CRYPTO_FLAG := "-O -D__BLST_PORTABLE__" endif +CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) # runs all unit tests of the insecure module .PHONY: test diff --git a/integration/Makefile b/integration/Makefile index b01c10d1954..2d7eb14e867 100644 --- a/integration/Makefile +++ b/integration/Makefile @@ -20,11 +20,12 @@ endif # the crypto package uses BLST source files underneath which may use ADX insructions. ifeq ($(ADX_SUPPORT), 1) # if ADX insructions are supported, default is to use a fast ADX BLST implementation - CGO_FLAG := + CRYPTO_FLAG := "" else # if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation - CGO_FLAG := CGO_CFLAGS="-O -D__BLST_PORTABLE__" + CRYPTO_FLAG := "-O -D__BLST_PORTABLE__" endif +CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) # Run the integration test suite .PHONY: integration-test diff --git a/integration/benchmark/cmd/manual/Dockerfile b/integration/benchmark/cmd/manual/Dockerfile index 8ae85e43326..b93d44812a0 100644 --- a/integration/benchmark/cmd/manual/Dockerfile +++ b/integration/benchmark/cmd/manual/Dockerfile @@ -39,7 +39,7 @@ RUN --mount=type=cache,sharing=locked,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ --mount=type=ssh \ cd integration && \ - CGO_ENABLED=1 ${CGO_FLAG} go build -ldflags "-extldflags -static" -o ./app ./${TARGET} + CGO_ENABLED=1 CGO_FLAGS="${CGO_FLAG}" go build -ldflags "-extldflags -static" -o ./app ./${TARGET} RUN mv /app/integration/app /app/app From fa5177f2fce86f3c53fc65d76a19f753659fa56b Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 1 Jun 2023 14:41:01 -0600 Subject: [PATCH 133/200] add cgo flag to mockgen commands --- Makefile | 6 +++--- crypto/bls12381_utils.go | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 93fa3be60ba..c927ff4403a 100644 --- a/Makefile +++ b/Makefile @@ -145,9 +145,9 @@ generate-fvm-env-wrappers: .PHONY: generate-mocks generate-mocks: install-mock-generators mockery --name '(Connector|PingInfoProvider)' --dir=network/p2p --case=underscore --output="./network/mocknetwork" --outpkg="mocknetwork" - mockgen -destination=storage/mocks/storage.go -package=mocks github.com/onflow/flow-go/storage Blocks,Headers,Payloads,Collections,Commits,Events,ServiceEvents,TransactionResults - mockgen -destination=module/mocks/network.go -package=mocks github.com/onflow/flow-go/module Local,Requester - mockgen -destination=network/mocknetwork/mock_network.go -package=mocknetwork github.com/onflow/flow-go/network Network + $(CGO_FLAG) mockgen 
-destination=storage/mocks/storage.go -package=mocks github.com/onflow/flow-go/storage Blocks,Headers,Payloads,Collections,Commits,Events,ServiceEvents,TransactionResults + $(CGO_FLAG) mockgen -destination=module/mocks/network.go -package=mocks github.com/onflow/flow-go/module Local,Requester + $(CGO_FLAG) mockgen -destination=network/mocknetwork/mock_network.go -package=mocknetwork github.com/onflow/flow-go/network Network mockery --name='.*' --dir=integration/benchmark/mocksiface --case=underscore --output="integration/benchmark/mock" --outpkg="mock" mockery --name=ExecutionDataStore --dir=module/executiondatasync/execution_data --case=underscore --output="./module/executiondatasync/execution_data/mock" --outpkg="mock" mockery --name=Downloader --dir=module/executiondatasync/execution_data --case=underscore --output="./module/executiondatasync/execution_data/mock" --outpkg="mock" diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index fa2a6ff65de..f071b7b9f43 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -14,7 +14,7 @@ package crypto // # include // # include // static void handler(int signum) -// { char text[1024] = "Caught SIGILL in blst_cgo_init, BLST library (used by flow-go/crypto) requires ADX support, build with \"CGO_CFLAGS=-O -D__BLST_PORTABLE__\"\n"; +// { char text[1024] = "Caught SIGILL in blst_cgo_init, BLST library (used by flow-go/crypto) requires ADX support, build with CGO_CFLAGS=\"-O -D__BLST_PORTABLE__\"\n"; // ssize_t n = write(2, &text, strlen(text)); // _exit(128+SIGILL); // (void)n; From 2a87898a7ba49eb4d1f04105ca17b4ca916fbd84 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 16 Aug 2023 20:45:21 -0600 Subject: [PATCH 134/200] remove test assertion strings in favor of PRG seed logging for test reproduction --- crypto/bls_test.go | 101 +++++++++++--------------------------- crypto/sign_test_utils.go | 49 +++++++++--------- 2 files changed, 52 insertions(+), 98 deletions(-) diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 4047967be9b..0ead9fd3100 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -75,8 +75,7 @@ func TestBLSMainMethods(t *testing.T) { // test a valid signature result, err := pk.Verify(s, input, hasher) assert.NoError(t, err) - assert.True(t, result, - "Verification should succeed:\n signature:%s\n message:%x\n private key:%s", s, input, sk) + assert.True(t, result) } }) } @@ -281,7 +280,7 @@ func TestBLSPOP(t *testing.T) { // test a valid PoP result, err := BLSVerifyPOP(pk, s) require.NoError(t, err) - assert.True(t, result, "Verification should succeed:\n signature:%s\n private key:%s", s, sk) + assert.True(t, result) // test with a valid but different key seed[0] ^= 1 @@ -289,7 +288,7 @@ func TestBLSPOP(t *testing.T) { require.NoError(t, err) result, err = BLSVerifyPOP(wrongSk.PublicKey(), s) require.NoError(t, err) - assert.False(t, result, "Verification should fail:\n signature:%s\n private key:%s", s, sk) + assert.False(t, result) } }) @@ -350,15 +349,11 @@ func TestBLSAggregateSignatures(t *testing.T) { aggSig, err := AggregateBLSSignatures(sigs) require.NoError(t, err) // First check: check the signatures are equal - assert.Equal(t, aggSig, expectedSig, - "incorrect signature %s, should be %s, private keys are %s, input is %x", - aggSig, expectedSig, sks, input) + assert.Equal(t, aggSig, expectedSig) // Second check: Verify the aggregated signature valid, err := VerifyBLSSignatureOneMessage(pks, aggSig, input, kmac) require.NoError(t, err) - assert.True(t, valid, - 
"Verification of %s failed, signature should be %s private keys are %s, input is %x", - aggSig, expectedSig, sks, input) + assert.True(t, valid) }) // check if one signature is not correct @@ -370,15 +365,11 @@ func TestBLSAggregateSignatures(t *testing.T) { aggSig, err = AggregateBLSSignatures(sigs) require.NoError(t, err) // First check: check the signatures are not equal - assert.NotEqual(t, aggSig, expectedSig, - "signature %s shouldn't be %s private keys are %s, input is %x", - aggSig, expectedSig, sks, input) + assert.NotEqual(t, aggSig, expectedSig) // Second check: multi-verification should fail valid, err := VerifyBLSSignatureOneMessage(pks, aggSig, input, kmac) require.NoError(t, err) - assert.False(t, valid, - "verification of signature %s should fail, it shouldn't be %s private keys are %s, input is %x", - aggSig, expectedSig, sks, input) + assert.False(t, valid) sigs[randomIndex], err = sks[randomIndex].Sign(input, kmac) // rebuild the correct signature require.NoError(t, err) }) @@ -393,14 +384,10 @@ func TestBLSAggregateSignatures(t *testing.T) { require.NoError(t, err) expectedSig, err = aggSk.Sign(input, kmac) require.NoError(t, err) - assert.NotEqual(t, aggSig, expectedSig, - "signature %s shouldn't be %s, private keys are %s, input is %x, wrong key is of index %d", - aggSig, expectedSig, sks, input, randomIndex) + assert.NotEqual(t, aggSig, expectedSig) valid, err := VerifyBLSSignatureOneMessage(pks, aggSig, input, kmac) require.NoError(t, err) - assert.False(t, valid, - "signature %s should fail, shouldn't be %s, private keys are %s, input is %x, wrong key is of index %d", - aggSig, expectedSig, sks, input, randomIndex) + assert.False(t, valid) }) t.Run("invalid inputs", func(t *testing.T) { @@ -500,9 +487,7 @@ func TestBLSAggregatePublicKeys(t *testing.T) { keys := []PublicKey{pks[0], IdentityBLSPublicKey()} aggPkWithIdentity, err := AggregateBLSPublicKeys(keys) assert.NoError(t, err) - assert.True(t, aggPkWithIdentity.Equals(pks[0]), - "incorrect public key %s, should be %s", - aggPkWithIdentity, pks[0]) + assert.True(t, aggPkWithIdentity.Equals(pks[0])) }) t.Run("invalid inputs", func(t *testing.T) { @@ -604,9 +589,7 @@ func TestBLSRemovePubKeys(t *testing.T) { BLSkey, ok := expectedPatrialPk.(*pubKeyBLSBLS12381) require.True(t, ok) - assert.True(t, BLSkey.Equals(partialPk), - "incorrect key %s, should be %s, keys are %s, index is %d", - partialPk, BLSkey, pks, pkToRemoveNum) + assert.True(t, BLSkey.Equals(partialPk)) }) // remove an extra key and check inequality @@ -617,9 +600,7 @@ func TestBLSRemovePubKeys(t *testing.T) { BLSkey, ok := expectedPatrialPk.(*pubKeyBLSBLS12381) require.True(t, ok) - assert.False(t, BLSkey.Equals(partialPk), - "incorrect key %s, should not be %s, keys are %s, index is %d, extra key is %s", - partialPk, BLSkey, pks, pkToRemoveNum, extraPk) + assert.False(t, BLSkey.Equals(partialPk)) }) // specific test to remove all keys @@ -634,9 +615,7 @@ func TestBLSRemovePubKeys(t *testing.T) { BLSRandomPk, ok := randomPk.(*pubKeyBLSBLS12381) require.True(t, ok) - assert.True(t, BLSRandomPk.Equals(randomPkPlusIdentityPk), - "incorrect key %s, should be infinity point, keys are %s", - identityPk, pks) + assert.True(t, BLSRandomPk.Equals(randomPkPlusIdentityPk)) }) // specific test with an empty slice of keys to remove @@ -647,9 +626,7 @@ func TestBLSRemovePubKeys(t *testing.T) { aggBLSkey, ok := aggPk.(*pubKeyBLSBLS12381) require.True(t, ok) - assert.True(t, aggBLSkey.Equals(partialPk), - "incorrect key %s, should be %s", - partialPk, aggBLSkey) + 
assert.True(t, aggBLSkey.Equals(partialPk)) }) t.Run("invalid inputs", func(t *testing.T) { @@ -702,9 +679,7 @@ func TestBLSBatchVerify(t *testing.T) { t.Run("all signatures are valid", func(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks, sigs, input, kmac) require.NoError(t, err) - assert.Equal(t, valid, expectedValid, - "Verification of %s failed, private keys are %s, input is %x, results is %v", - sigs, sks, input, valid) + assert.Equal(t, valid, expectedValid) }) // valid signatures but indices aren't correct: sig[i] is correct under pks[j] @@ -719,9 +694,7 @@ func TestBLSBatchVerify(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks, sigs, input, kmac) require.NoError(t, err) expectedValid[i], expectedValid[j] = false, false - assert.Equal(t, valid, expectedValid, - "Verification of %s failed, private keys are %s, input is %x, results is %v", - sigs, sks, input, valid) + assert.Equal(t, valid, expectedValid) // restore keys pks[i], pks[j] = pks[j], pks[i] @@ -732,9 +705,7 @@ func TestBLSBatchVerify(t *testing.T) { t.Run("one valid signature", func(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks[:1], sigs[:1], input, kmac) require.NoError(t, err) - assert.Equal(t, expectedValid[:1], valid, - "Verification of %s failed, private keys are %s, input is %x, results is %v", - sigs[:1], sks[:1], input, valid) + assert.Equal(t, expectedValid[:1], valid) }) // pick a random number of invalid signatures @@ -759,9 +730,7 @@ func TestBLSBatchVerify(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks, sigs, input, kmac) require.NoError(t, err) - assert.Equal(t, expectedValid, valid, - "Verification of %s failed\n private keys are %s\n input is %x\n results is %v", - sigs, sks, input, valid) + assert.Equal(t, expectedValid, valid) }) // all signatures are invalid @@ -776,9 +745,7 @@ func TestBLSBatchVerify(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks, sigs, input, kmac) require.NoError(t, err) - assert.Equal(t, valid, expectedValid, - "Verification of %s failed, private keys are %s, input is %x, results is %v", - sigs, sks, input, valid) + assert.Equal(t, valid, expectedValid) }) // test the empty list case @@ -786,8 +753,7 @@ func TestBLSBatchVerify(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks[:0], sigs[:0], input, kmac) require.Error(t, err) assert.True(t, IsBLSAggregateEmptyListError(err)) - assert.Equal(t, valid, expectedValid[:0], - "verification should fail with empty list key, got %v", valid) + assert.Equal(t, valid, expectedValid[:0]) }) // test incorrect inputs @@ -798,8 +764,7 @@ func TestBLSBatchVerify(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks[:len(pks)-1], sigs, input, kmac) require.Error(t, err) assert.True(t, IsInvalidInputsError(err)) - assert.Equal(t, valid, expectedValid, - "verification should fail with incorrect input lenghts, got %v", valid) + assert.Equal(t, valid, expectedValid) }) // test wrong hasher @@ -811,8 +776,7 @@ func TestBLSBatchVerify(t *testing.T) { require.Error(t, err) assert.True(t, IsNilHasherError(err)) - assert.Equal(t, valid, expectedValid, - "verification should fail with nil hasher, got %v", valid) + assert.Equal(t, valid, expectedValid) }) // test wrong key @@ -825,8 +789,7 @@ func TestBLSBatchVerify(t *testing.T) { require.Error(t, err) assert.True(t, IsNotBLSKeyError(err)) - assert.Equal(t, valid, expectedValid, - "verification should fail with invalid key, got %v", valid) + assert.Equal(t, valid, 
expectedValid) }) } @@ -962,9 +925,7 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { // Verify the aggregated signature valid, err := VerifyBLSSignatureManyMessages(inputPks, aggSig, inputMsgs, inputKmacs) require.NoError(t, err) - assert.True(t, valid, - "Verification of %s failed, should be valid, private keys are %s, inputs are %x, input public keys are %s", - aggSig, sks, inputMsgs, inputPks) + assert.True(t, valid) }) // check if one of the signatures is not correct @@ -979,9 +940,7 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { require.NoError(t, err) valid, err := VerifyBLSSignatureManyMessages(inputPks, aggSig, inputMsgs, inputKmacs) require.NoError(t, err) - assert.False(t, valid, - "Verification of %s should fail, private keys are %s, inputs are %x, input public keys are %s", - aggSig, sks, inputMsgs, inputPks) + assert.False(t, valid) }) // test the empty keys case @@ -989,7 +948,7 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { valid, err := VerifyBLSSignatureManyMessages(inputPks[:0], aggSig, inputMsgs, inputKmacs) assert.Error(t, err) assert.True(t, IsBLSAggregateEmptyListError(err)) - assert.False(t, valid, "verification should fail with an empty key list") + assert.False(t, valid) }) // test inconsistent input arrays @@ -998,13 +957,13 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { valid, err := VerifyBLSSignatureManyMessages(inputPks, aggSig, inputMsgs[:sigsNum-1], inputKmacs) assert.Error(t, err) assert.True(t, IsInvalidInputsError(err)) - assert.False(t, valid, "verification should fail with inconsistent messages and hashers") + assert.False(t, valid) // empty key list valid, err = VerifyBLSSignatureManyMessages(inputPks[:0], aggSig, inputMsgs, inputKmacs) assert.Error(t, err) assert.True(t, IsBLSAggregateEmptyListError(err)) - assert.False(t, valid, "verification should fail with empty list key") + assert.False(t, valid) // nil hasher tmp := inputKmacs[0] @@ -1012,7 +971,7 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { valid, err = VerifyBLSSignatureManyMessages(inputPks, aggSig, inputMsgs, inputKmacs) assert.Error(t, err) assert.True(t, IsNilHasherError(err)) - assert.False(t, valid, "verification should fail with nil hasher") + assert.False(t, valid) inputKmacs[0] = tmp // wrong key @@ -1021,7 +980,7 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { valid, err = VerifyBLSSignatureManyMessages(inputPks, aggSig, inputMsgs, inputKmacs) assert.Error(t, err) assert.True(t, IsNotBLSKeyError(err)) - assert.False(t, valid, "verification should fail with nil hasher") + assert.False(t, valid) inputPks[0] = tmpPK }) diff --git a/crypto/sign_test_utils.go b/crypto/sign_test_utils.go index 06179a01989..8f00a0c77e5 100644 --- a/crypto/sign_test_utils.go +++ b/crypto/sign_test_utils.go @@ -75,15 +75,13 @@ func testGenSignVerify(t *testing.T, salg SigningAlgorithm, halg hash.Hasher) { // test a valid signature result, err := pk.Verify(s, input, halg) require.NoError(t, err) - assert.True(t, result, fmt.Sprintf( - "Verification should succeed:\n signature:%s\n message:%x\n private key:%s", s, input, sk)) + assert.True(t, result) // test with a different message input[0] ^= 1 result, err = pk.Verify(s, input, halg) require.NoError(t, err) - assert.False(t, result, fmt.Sprintf( - "Verification should fail:\n signature:%s\n message:%x\n private key:%s", s, input, sk)) + assert.False(t, result) input[0] ^= 1 // test with a valid but different key @@ -92,8 +90,7 @@ func testGenSignVerify(t 
*testing.T, salg SigningAlgorithm, halg hash.Hasher) { require.NoError(t, err) result, err = wrongSk.PublicKey().Verify(s, input, halg) require.NoError(t, err) - assert.False(t, result, fmt.Sprintf( - "Verification should fail:\n signature:%s\n message:%x\n private key:%s", s, input, sk)) + assert.False(t, result) // test a wrong signature length invalidLen := rand.Intn(2 * len(s)) // try random invalid lengths @@ -103,9 +100,7 @@ func testGenSignVerify(t *testing.T, salg SigningAlgorithm, halg hash.Hasher) { invalidSig := make([]byte, invalidLen) result, err = pk.Verify(invalidSig, input, halg) require.NoError(t, err) - assert.False(t, result, fmt.Sprintf( - "Verification should fail:\n signature:%s\n with invalid length %d", invalidSig, invalidLen)) - + assert.False(t, result) } }) } @@ -172,7 +167,7 @@ func testEncodeDecode(t *testing.T, salg SigningAlgorithm) { require.Equal(t, read, KeyGenSeedMinLen) require.NoError(t, err) sk, err := GeneratePrivateKey(salg, seed) - assert.Nil(t, err, "the key generation failed") + assert.Nil(t, err) seed[0] ^= 1 // alter the seed to get a new private key distinctSk, err := GeneratePrivateKey(salg, seed) require.NoError(t, err) @@ -180,10 +175,10 @@ func testEncodeDecode(t *testing.T, salg SigningAlgorithm) { // check private key encoding skBytes := sk.Encode() skCheck, err := DecodePrivateKey(salg, skBytes) - require.Nil(t, err, "the key decoding failed") - assert.True(t, sk.Equals(skCheck), "key equality check failed") + require.Nil(t, err) + assert.True(t, sk.Equals(skCheck)) skCheckBytes := skCheck.Encode() - assert.Equal(t, skBytes, skCheckBytes, "keys should be equal") + assert.Equal(t, skBytes, skCheckBytes) distinctSkBytes := distinctSk.Encode() assert.NotEqual(t, skBytes, distinctSkBytes) @@ -192,23 +187,23 @@ func testEncodeDecode(t *testing.T, salg SigningAlgorithm) { pkBytes := pk.Encode() pkCheck, err := DecodePublicKey(salg, pkBytes) require.Nil(t, err) - assert.True(t, pk.Equals(pkCheck), "key equality check failed") + assert.True(t, pk.Equals(pkCheck)) pkCheckBytes := pkCheck.Encode() - assert.Equal(t, pkBytes, pkCheckBytes, "keys should be equal") + assert.Equal(t, pkBytes, pkCheckBytes) distinctPkBytes := distinctSk.PublicKey().Encode() - assert.NotEqual(t, pkBytes, distinctPkBytes, "keys should be different") + assert.NotEqual(t, pkBytes, distinctPkBytes) // same for the compressed encoding // skip is BLS is used and compression isn't supported if !(salg == BLSBLS12381 && !isG2Compressed()) { pkComprBytes := pk.EncodeCompressed() pkComprCheck, err := DecodePublicKeyCompressed(salg, pkComprBytes) - require.Nil(t, err, "the key decoding failed") - assert.True(t, pk.Equals(pkComprCheck), "key equality check failed") + require.Nil(t, err) + assert.True(t, pk.Equals(pkComprCheck)) pkCheckComprBytes := pkComprCheck.EncodeCompressed() - assert.Equal(t, pkComprBytes, pkCheckComprBytes, "keys should be equal") + assert.Equal(t, pkComprBytes, pkCheckComprBytes) distinctPkComprBytes := distinctSk.PublicKey().EncodeCompressed() - assert.NotEqual(t, pkComprBytes, distinctPkComprBytes, "keys should be different") + assert.NotEqual(t, pkComprBytes, distinctPkComprBytes) } } }) @@ -228,7 +223,7 @@ func testEncodeDecode(t *testing.T, salg SigningAlgorithm) { groupOrder[BLSBLS12381] = BLS12381Order sk, err := DecodePrivateKey(salg, groupOrder[salg]) - require.Error(t, err, "the key decoding should fail - private key value is too large") + require.Error(t, err) assert.True(t, IsInvalidInputsError(err)) assert.Nil(t, sk) }) @@ -293,12 +288,12 @@ func 
testEquals(t *testing.T, salg SigningAlgorithm, otherSigAlgo SigningAlgorit pk4 := sk4.PublicKey() // tests - assert.True(t, sk1.Equals(sk2), "key equality should return true") - assert.True(t, pk1.Equals(pk2), "key equality should return true") - assert.False(t, sk1.Equals(sk3), "key equality should return false") - assert.False(t, pk1.Equals(pk3), "key equality should return false") - assert.False(t, sk1.Equals(sk4), "key equality should return false") - assert.False(t, pk1.Equals(pk4), "key equality should return false") + assert.True(t, sk1.Equals(sk2)) + assert.True(t, pk1.Equals(pk2)) + assert.False(t, sk1.Equals(sk3)) + assert.False(t, pk1.Equals(pk3)) + assert.False(t, sk1.Equals(sk4)) + assert.False(t, pk1.Equals(pk4)) }) } From 8ffac589dc4dec581c91715e4e261add09625715 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 17 Aug 2023 14:42:51 -0600 Subject: [PATCH 135/200] add interface implementation sanity checks --- crypto/bls.go | 6 ++++++ crypto/ecdsa.go | 8 +++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/crypto/bls.go b/crypto/bls.go index 447a203033b..b5ed13bd83d 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -349,6 +349,9 @@ func (a *blsBLS12381Algo) decodePublicKeyCompressed(publicKeyBytes []byte) (Publ } // prKeyBLSBLS12381 is the private key of BLS using BLS12_381, it implements PrivateKey + +var _ PrivateKey = (*prKeyBLSBLS12381)(nil) + type prKeyBLSBLS12381 struct { // public key pk *pubKeyBLSBLS12381 @@ -426,6 +429,9 @@ func (sk *prKeyBLSBLS12381) String() string { // pubKeyBLSBLS12381 is the public key of BLS using BLS12_381, // it implements PublicKey. + +var _ PublicKey = (*pubKeyBLSBLS12381)(nil) + type pubKeyBLSBLS12381 struct { // The package guarantees an instance is only created with a point // on the correct G2 subgroup. No membership check is needed when the diff --git a/crypto/ecdsa.go b/crypto/ecdsa.go index dca3604570a..67d97e9a854 100644 --- a/crypto/ecdsa.go +++ b/crypto/ecdsa.go @@ -321,7 +321,10 @@ func (a *ecdsaAlgo) decodePublicKeyCompressed(pkBytes []byte) (PublicKey, error) return &pubKeyECDSA{a, goPubKey}, nil } -// prKeyECDSA is the private key of ECDSA, it implements the generic PrivateKey +// prKeyECDSA is the private key of ECDSA, it implements the interface PrivateKey + +var _ PrivateKey = (*prKeyECDSA)(nil) + type prKeyECDSA struct { // the signature algo alg *ecdsaAlgo @@ -392,6 +395,9 @@ func (sk *prKeyECDSA) String() string { } // pubKeyECDSA is the public key of ECDSA, it implements PublicKey + +var _ PublicKey = (*pubKeyECDSA)(nil) + type pubKeyECDSA struct { // the signature algo alg *ecdsaAlgo From 8415a45958a8ea41d863a7b3d75ba0a6fd743cde Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 17 Aug 2023 23:58:36 -0600 Subject: [PATCH 136/200] add faster scalar mult in E2 for small expos --- crypto/bls12381_utils.c | 50 ++++++++++++++++++++++++++--------- crypto/bls12381_utils.h | 3 ++- crypto/bls12381_utils_test.go | 3 ++- crypto/dkg_core.c | 2 +- crypto/sign_test_utils.go | 3 ++- 5 files changed, 45 insertions(+), 16 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index d88bfa3aaa8..97725545b26 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -95,6 +95,7 @@ void Fr_inv_montg_eucl(Fr *res, const Fr *a) { // if base = b*R, res = b^expo * R // In general, res = base^expo * R^(-expo+1) // `expo` is encoded as a little-endian limb_t table of length `expo_len`. +// `expo` must be non-zero. // TODO: clean up? 
void Fr_exp_montg(Fr *res, const Fr* base, const limb_t* expo, const int expo_len) { // mask of the most significant bit @@ -103,15 +104,15 @@ void Fr_exp_montg(Fr *res, const Fr* base, const limb_t* expo, const int expo_le int index = 0; expo += expo_len; - // Treat most significant zero limbs + // process most significant zero limbs while((index < expo_len) && (*(--expo) == 0)) { index++; } - // Treat the most significant zero bits + // process the most significant zero bits while((*expo & mask) == 0) { mask >>= 1; } - // Treat the first `1` bit + // process the first `1` bit Fr_copy(res, base); mask >>= 1; // Scan all limbs of the exponent @@ -909,6 +910,11 @@ void E2_add(E2* res, const E2* a, const E2* b) { POINTonE2_dadd((POINTonE2*)res, (POINTonE2*)a, (POINTonE2*)b, NULL); } +// generic point double that must handle point at infinity +void E2_double(E2* res, const E2* a) { + POINTonE2_double((POINTonE2*)res, (POINTonE2*)a); +} + // Point negation: res = -a void E2_neg(E2* res, const E2* a) { // TODO: optimize @@ -924,14 +930,34 @@ void E2_mult(E2* res, const E2* p, const Fr* expo) { vec_zero(&tmp, sizeof(tmp)); } -// Exponentiation of a generic point `a` in E2 by a byte exponent. +// Exponentiation of a generic point `a` in E2 by a byte exponent, +// using a classic double-and-add algorithm (non constant-time) void E2_mult_small_expo(E2* res, const E2* p, const byte expo) { - pow256 pow_expo; - vec_zero(&pow_expo, sizeof(pow256)); - pow_expo[0] = expo; // `pow256` uses bytes little endian. - // TODO: to bench against a specific version of mult with 8 bits expo - POINTonE2_mult_gls((POINTonE2*)res, (POINTonE2*)p, pow_expo); - pow_expo[0] = 0; + // return early if expo is zero + if (expo == 0) { + E2_set_infty(res); + return; + } + // expo is non zero + + byte mask = 1<<7; + // process the most significant zero bits + while((expo & mask) == 0) { + mask >>= 1; + } + + // process the first `1` bit + E2 tmp; + E2_copy(&tmp, p); + mask >>= 1; + // scan the remaining bits + for ( ; mask != 0 ; mask >>= 1 ) { + E2_double(&tmp, &tmp); + if (expo & mask) { + E2_add(&tmp, &tmp, p); + } + } + E2_copy(res, &tmp); } // Exponentiation of generator g2 of G2, res = expo.g2 @@ -1126,8 +1152,8 @@ void E1_print_(char* s, const E1* p, const int jacob) { void E2_print_(char* s, const E2* p, const int jacob) { E2 a; E2_copy(&a, p); - if (strlen(s)) if (!jacob) E2_to_affine(&a, &a); - printf("[%s]:\n", s); + if (!jacob) E2_to_affine(&a, &a); + if (strlen(s)) printf("[%s]:\n", s); Fp2_print_("", &(a.x)); Fp2_print_("", &(a.y)); if (jacob) Fp2_print_("", &(a.z)); diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index d35e0298c59..ae899877be2 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -113,6 +113,7 @@ void G2_mult_gen(E2*, const Fr*); void E2_mult(E2*, const E2*, const Fr*); void E2_mult_small_expo(E2*, const E2*, const byte); void E2_add(E2* res, const E2* a, const E2* b); +void E2_double(E2* res, const E2* a); void E2_neg(E2*, const E2*); void E2_sum_vector(E2*, const E2*, const int); void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); @@ -129,7 +130,7 @@ void Fp12_multi_pairing(Fp12*, const E1*, const E2*, const int); void xmd_sha256(byte *, int, byte *, int, byte *, int); // Debugging related functions -#define DEBUG 0 +#define DEBUG 1 #if (DEBUG == 1) #include void bytes_print_(char*, byte*, int); diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 067ac979f7e..a9efd543ed1 100644 --- 
a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -1,6 +1,7 @@ package crypto import ( + "crypto/rand" "encoding/hex" mrand "math/rand" "testing" @@ -54,7 +55,7 @@ func TestScalarMultBLS12381(t *testing.T) { // G1 and G2 scalar multiplication func BenchmarkScalarMult(b *testing.B) { seed := make([]byte, securityBits/8) - _, err := mrand.Read(seed) + _, err := rand.Read(seed) require.NoError(b, err) var expo scalar diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 15e8e0c48b3..674973e1d8a 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -35,7 +35,7 @@ void Fr_polynomial_image(Fr* image, E2* y, const Fr* a, const int degree, const static void E2_polynomial_image(E2* y, const E2* A, const int degree, const byte x){ E2_set_infty(y); for (int i = degree; i >= 0 ; i--) { - E2_mult_small_expo(y, y, x); // TODO: to bench against a specific version of mult with 8 bits expo + E2_mult_small_expo(y, y, x); E2_add(y, y, &A[i]); } } diff --git a/crypto/sign_test_utils.go b/crypto/sign_test_utils.go index 8f00a0c77e5..9ecc684a4be 100644 --- a/crypto/sign_test_utils.go +++ b/crypto/sign_test_utils.go @@ -5,6 +5,7 @@ import ( "fmt" mrand "math/rand" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -13,7 +14,7 @@ import ( ) func getPRG(t *testing.T) *mrand.Rand { - random := int64(1685491239186156000) //time.Now().UnixNano() + random := time.Now().UnixNano() t.Logf("rng seed is %d", random) rng := mrand.New(mrand.NewSource(random)) return rng From a408dec46ae1e1c5366dfe89bef212d43b4438b6 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 18 Aug 2023 12:35:33 -0600 Subject: [PATCH 137/200] more implementation check sanity check --- crypto/bls_thresholdsign.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 2f05ed72c42..9451f4fb6dc 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -34,6 +34,8 @@ import ( // blsThresholdSignatureParticipant implements ThresholdSignatureParticipant // based on the BLS signature scheme +var _ ThresholdSignatureParticipant = (*blsThresholdSignatureParticipant)(nil) + type blsThresholdSignatureParticipant struct { // embed the follower *blsThresholdSignatureInspector @@ -45,6 +47,8 @@ type blsThresholdSignatureParticipant struct { // blsThresholdSignatureInspector implements ThresholdSignatureInspector // based on the BLS signature scheme +var _ ThresholdSignatureInspector = (*blsThresholdSignatureInspector)(nil) + type blsThresholdSignatureInspector struct { // size of the group size int From 65ee3bf1ab143e77d74e29a3ebcfdc9e3ac72c38 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 18 Aug 2023 13:13:39 -0600 Subject: [PATCH 138/200] clean up some todos and add global g2 key --- crypto/bls12381_utils.c | 1 - crypto/bls12381_utils.go | 5 +++++ crypto/bls_core.c | 2 +- crypto/bls_multisig.go | 10 ++-------- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 97725545b26..0614af773fe 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -165,7 +165,6 @@ static void pow256_from_be_bytes(pow256 ret, const byte a[Fr_BYTES]) } // internal type of BLST `pow256` uses bytes little endian. -// TODO: check endianness!! 
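The `E2_mult_small_expo` rewrite in patch 136 above replaces the pow256-based multiplication with an explicit MSB-first double-and-add scan of the byte exponent. Below is a minimal standalone sketch of the same pattern over plain integers (the name `small_expo_mult` is hypothetical and not part of the library); integer doubling stands in for `E2_double` and integer addition for `E2_add`:

```c
#include <assert.h>
#include <stdint.h>

// MSB-first double-and-add, mirroring the bit scan in E2_mult_small_expo:
// returns expo * p, with doubling/addition standing in for the group operations.
static uint64_t small_expo_mult(uint64_t p, uint8_t expo) {
  if (expo == 0) {
    return 0; // analogous to returning the point at infinity
  }
  uint8_t mask = 1 << 7;
  while ((expo & mask) == 0) { // skip the most significant zero bits
    mask >>= 1;
  }
  uint64_t acc = p; // process the first `1` bit
  mask >>= 1;
  for (; mask != 0; mask >>= 1) { // scan the remaining bits
    acc = 2 * acc;   // double
    if (expo & mask) {
      acc = acc + p; // add
    }
  }
  return acc;
}

int main(void) {
  for (int e = 0; e < 256; e++) {
    assert(small_expo_mult(7, (uint8_t)e) == 7u * (uint64_t)e);
  }
  return 0;
}
```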
static void pow256_from_Fr(pow256 ret, const Fr* in) { le_bytes_from_limbs(ret, (limb_t*)in, Fr_BYTES); } diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index f071b7b9f43..40580ca7239 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -67,6 +67,8 @@ var g2SerHeader byte // g2 // `g1“ serialization var g1Serialization []byte +var g2PublicKey pubKeyBLSBLS12381 + // initialization of BLS12-381 curve func initBLS12381() { if isG1Compressed() { @@ -80,6 +82,9 @@ func initBLS12381() { } else { g2SerHeader = 0x40 } + // set a global point to infinity + C.E2_set_infty((*C.E2)(&g2PublicKey.point)) + g2PublicKey.isIdentity = true } func (a *scalar) String() string { diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 0771269ed86..39b9e243fd1 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -398,7 +398,7 @@ void bls_batch_verify(const int sigs_len, byte* results, const E2* pks_input, Fr_set_limb(&one, 1); Fr_add(&r, &r, &one); // multiply public key and signature by the same random exponent r - E2_mult(&pks[i], &pks_input[i], &r); // TODO: faster version for short expos? + E2_mult(&pks[i], &pks_input[i], &r); E1_mult(&sigs[i], &sigs[i], &r); } } diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index 7adbb0c1f45..7f57cd09888 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -192,15 +192,9 @@ func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { } // IdentityBLSPublicKey returns an identity public key which corresponds to the point -// at infinity in G2 (identity element of G2). -// TODO: return a constant key instead of a newly allocated one +// at infinity in G2 (identity element g2). func IdentityBLSPublicKey() PublicKey { - - identity := *newPubKeyBLSBLS12381(nil) - // set the point to infinity - C.E2_set_infty((*C.E2)(&identity.point)) - identity.isIdentity = true - return &identity + return &g2PublicKey } // RemoveBLSPublicKeys removes multiple BLS public keys from a given (aggregated) public key. From b3797441d28be8009b4262218e45fbbeb5529e22 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 18 Aug 2023 13:45:51 -0600 Subject: [PATCH 139/200] address more TODOs --- crypto/bls12381_utils.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 0614af773fe..b2385baa37f 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -95,8 +95,7 @@ void Fr_inv_montg_eucl(Fr *res, const Fr *a) { // if base = b*R, res = b^expo * R // In general, res = base^expo * R^(-expo+1) // `expo` is encoded as a little-endian limb_t table of length `expo_len`. -// `expo` must be non-zero. -// TODO: clean up? 
+// TODO: could be deleted void Fr_exp_montg(Fr *res, const Fr* base, const limb_t* expo, const int expo_len) { // mask of the most significant bit const limb_t msb_mask = (limb_t)1<<((sizeof(limb_t)<<3)-1); @@ -108,29 +107,38 @@ void Fr_exp_montg(Fr *res, const Fr* base, const limb_t* expo, const int expo_le while((index < expo_len) && (*(--expo) == 0)) { index++; } + // if expo is zero + if (index == expo_len) { + Fr_copy(res, base); + return; + } + // expo is non zero // process the most significant zero bits while((*expo & mask) == 0) { mask >>= 1; } + Fr tmp; // process the first `1` bit - Fr_copy(res, base); + Fr_copy(&tmp, base); mask >>= 1; // Scan all limbs of the exponent for ( ; index < expo_len; expo--) { // Scan all bits for ( ; mask != 0 ; mask >>= 1 ) { // square - Fr_squ_montg(res, res); + Fr_squ_montg(&tmp, &tmp); // multiply if (*expo & mask) { - Fr_mul_montg(res, res ,base); + Fr_mul_montg(&tmp, &tmp ,base); } } mask = msb_mask; index++; } + Fr_copy(res, &tmp); } +// TODO: could be deleted void Fr_inv_exp_montg(Fr *res, const Fr *a) { Fr r_2; Fr_copy(&r_2, (Fr*)BLS12_381_r); @@ -217,8 +225,7 @@ void Fr_write_bytes(byte *bin, const Fr* a) { // maps big-endian bytes into an Fr element using modular reduction // Input is byte-big-endian, output is Fr (internally vec256) // TODO: check redc_mont_256(vec256 ret, const vec512 a, const vec256 p, limb_t n0); -static void Fr_from_be_bytes(Fr* out, const byte *bytes, size_t n) -{ +static void Fr_from_be_bytes(Fr* out, const byte *bytes, size_t n) { Fr digit, radix; Fr_set_zero(out); Fr_copy(&radix, (Fr*)BLS12_381_rRR); // R^2 @@ -610,7 +617,6 @@ void E1_add(E1* res, const E1* a, const E1* b) { // Point negation: res = -a void E1_neg(E1* res, const E1* a) { - // TODO: optimize E1_copy(res, a); POINTonE1_cneg((POINTonE1*)res, 1); } @@ -916,7 +922,6 @@ void E2_double(E2* res, const E2* a) { // Point negation: res = -a void E2_neg(E2* res, const E2* a) { - // TODO: optimize E2_copy(res, a); POINTonE2_cneg((POINTonE2*)res, 1); } From 9d6c7c7d7c245c86ae97544a5f213d21a67545db Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 18 Aug 2023 14:14:10 -0600 Subject: [PATCH 140/200] add c-formatting target --- crypto/.clang-format | 192 ++++ crypto/Makefile | 9 + crypto/bls12381_utils.c | 1593 ++++++++++++++-------------- crypto/bls12381_utils.h | 210 ++-- crypto/bls_core.c | 789 +++++++------- crypto/bls_include.h | 20 +- crypto/bls_thresholdsign_core.c | 188 ++-- crypto/bls_thresholdsign_include.h | 6 +- crypto/blst_include.h | 55 +- crypto/dkg_core.c | 114 +- crypto/dkg_include.h | 14 +- 11 files changed, 1738 insertions(+), 1452 deletions(-) create mode 100644 crypto/.clang-format diff --git a/crypto/.clang-format b/crypto/.clang-format new file mode 100644 index 00000000000..48b2c678323 --- /dev/null +++ b/crypto/.clang-format @@ -0,0 +1,192 @@ +--- +Language: Cpp +# BasedOnStyle: LLVM +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignArrayOfStructures: None +AlignConsecutiveMacros: None +AlignConsecutiveAssignments: None +AlignConsecutiveBitFields: None +AlignConsecutiveDeclarations: None +AlignEscapedNewlines: Right +AlignOperands: Align +AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortEnumsOnASingleLine: true +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: All +AllowShortLambdasOnASingleLine: All +AllowShortIfStatementsOnASingleLine: Never +AllowShortLoopsOnASingleLine: false 
+AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: MultiLine +AttributeMacros: + - __capability +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterCaseLabel: false + AfterClass: false + AfterControlStatement: Never + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakBeforeBinaryOperators: None +BreakBeforeConceptDeclarations: true +BreakBeforeBraces: Attach +BreakBeforeInheritanceComma: false +BreakInheritanceList: BeforeColon +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +QualifierAlignment: Leave +CompactNamespaces: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DeriveLineEnding: true +DerivePointerAlignment: false +DisableFormat: false +EmptyLineAfterAccessModifier: Never +EmptyLineBeforeAccessModifier: LogicalBlock +ExperimentalAutoDetectBinPacking: false +PackConstructorInitializers: BinPack +BasedOnStyle: '' +ConstructorInitializerAllOnOneLineOrOnePerLine: false +AllowAllConstructorInitializersOnNextLine: true +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IfMacros: + - KJ_IF_MAYBE +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + SortPriority: 0 + CaseSensitive: false + - Regex: '^(<|"(gtest|gmock|isl|json)/)' + Priority: 3 + SortPriority: 0 + CaseSensitive: false + - Regex: '.*' + Priority: 1 + SortPriority: 0 + CaseSensitive: false +IncludeIsMainRegex: '(Test)?$' +IncludeIsMainSourceRegex: '' +IndentAccessModifiers: false +IndentCaseLabels: false +IndentCaseBlocks: false +IndentGotoLabels: true +IndentPPDirectives: None +IndentExternBlock: AfterExternBlock +IndentRequires: false +IndentWidth: 2 +IndentWrappedFunctionNames: false +InsertTrailingCommas: None +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: true +LambdaBodyIndentation: Signature +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 2 +ObjCBreakBeforeNestedBlockParam: true +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakOpenParenthesis: 0 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PenaltyIndentedWhitespace: 0 +PointerAlignment: Right +PPIndentWidth: -1 +ReferenceAlignment: Pointer +ReflowComments: true +RemoveBracesLLVM: false +SeparateDefinitionBlocks: Leave +ShortNamespaceLines: 1 +SortIncludes: CaseSensitive +SortJavaStaticImport: Before +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCaseColon: false +SpaceBeforeCpp11BracedList: false 
+SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeParensOptions: + AfterControlStatements: true + AfterForeachMacros: true + AfterFunctionDefinitionName: false + AfterFunctionDeclarationName: false + AfterIfMacros: true + AfterOverloadedOperator: false + BeforeNonEmptyParentheses: false +SpaceAroundPointerQualifiers: Default +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: Never +SpacesInConditionalStatement: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpaceBeforeSquareBrackets: false +BitFieldColonSpacing: Both +Standard: Latest +StatementAttributeLikeMacros: + - Q_EMIT +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 8 +UseCRLF: false +UseTab: Never +WhitespaceSensitiveMacros: + - STRINGIZE + - PP_STRINGIZE + - BOOST_PP_STRINGIZE + - NS_SWIFT_NAME + - CF_SWIFT_NAME +... + diff --git a/crypto/Makefile b/crypto/Makefile index 04cc9ae19d8..28e7a5f6f2f 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -29,6 +29,15 @@ else endif CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) +# format +.PHONY: c-format +c-format: + clang-format -style=llvm -dump-config > .clang-format + clang-format -i *.c + clang-format -i *.h + rm -f .clang-format + git diff --exit-code + # test all packages .PHONY: test test: diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index b2385baa37f..665f3853236 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1,10 +1,10 @@ // this file contains utility functions for the curve BLS 12-381 -// these tools are shared by the BLS signature scheme, the BLS based threshold signature -// and the BLS distributed key generation protocols +// these tools are shared by the BLS signature scheme, the BLS based threshold +// signature and the BLS distributed key generation protocols #include "bls12381_utils.h" -#include "bls_include.h" #include "assert.h" +#include "bls_include.h" // compile all blst C src along with this file #include "blst_src.c" @@ -12,83 +12,87 @@ // ------------------- Fr utilities // Montgomery constant R related to the curve order r -// R mod r = (1<<256)%r -const Fr BLS12_381_rR = {{ \ - TO_LIMB_T(0x1824b159acc5056f), TO_LIMB_T(0x998c4fefecbc4ff5), \ - TO_LIMB_T(0x5884b7fa00034802), TO_LIMB_T(0x00000001fffffffe), \ +// R mod r = (1<<256)%r +const Fr BLS12_381_rR = {{ + TO_LIMB_T(0x1824b159acc5056f), + TO_LIMB_T(0x998c4fefecbc4ff5), + TO_LIMB_T(0x5884b7fa00034802), + TO_LIMB_T(0x00000001fffffffe), }}; // returns true if a == 0 and false otherwise -bool Fr_is_zero(const Fr* a) { - return bytes_are_zero((const byte*)a, sizeof(Fr)); +bool Fr_is_zero(const Fr *a) { + return bytes_are_zero((const byte *)a, sizeof(Fr)); } // returns true if a == b and false otherwise -bool Fr_is_equal(const Fr* a, const Fr* b) { - return vec_is_equal(a, b, sizeof(Fr)); +bool Fr_is_equal(const Fr *a, const Fr *b) { + return vec_is_equal(a, b, sizeof(Fr)); } // sets `a` to limb `l` -void Fr_set_limb(Fr* a, const limb_t l){ - vec_zero((byte*)a + sizeof(limb_t), sizeof(Fr) - sizeof(limb_t)); - *((limb_t*)a) = l; +void Fr_set_limb(Fr *a, const limb_t l) { + vec_zero((byte *)a + sizeof(limb_t), sizeof(Fr) - sizeof(limb_t)); + *((limb_t *)a) = l; } -void Fr_copy(Fr* res, const Fr* a) { - if ((uptr_t)a==(uptr_t)res) { - return; - } - 
vec_copy((byte*)res, (byte*)a, sizeof(Fr)); +void Fr_copy(Fr *res, const Fr *a) { + if ((uptr_t)a == (uptr_t)res) { + return; + } + vec_copy((byte *)res, (byte *)a, sizeof(Fr)); } // sets `a` to 0 -void Fr_set_zero(Fr* a){ - vec_zero((byte*)a, sizeof(Fr)); -} +void Fr_set_zero(Fr *a) { vec_zero((byte *)a, sizeof(Fr)); } void Fr_add(Fr *res, const Fr *a, const Fr *b) { - add_mod_256((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_r); + add_mod_256((limb_t *)res, (limb_t *)a, (limb_t *)b, BLS12_381_r); } void Fr_sub(Fr *res, const Fr *a, const Fr *b) { - sub_mod_256((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_r); + sub_mod_256((limb_t *)res, (limb_t *)a, (limb_t *)b, BLS12_381_r); } void Fr_neg(Fr *res, const Fr *a) { - cneg_mod_256((limb_t*)res, (limb_t*)a, 1, BLS12_381_r); + cneg_mod_256((limb_t *)res, (limb_t *)a, 1, BLS12_381_r); } // res = a*b*R^(-1) void Fr_mul_montg(Fr *res, const Fr *a, const Fr *b) { - mul_mont_sparse_256((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_r, r0); + mul_mont_sparse_256((limb_t *)res, (limb_t *)a, (limb_t *)b, BLS12_381_r, r0); } // res = a^2 * R^(-1) void Fr_squ_montg(Fr *res, const Fr *a) { - sqr_mont_sparse_256((limb_t*)res, (limb_t*)a, BLS12_381_r, r0); + sqr_mont_sparse_256((limb_t *)res, (limb_t *)a, BLS12_381_r, r0); } // res = a*R void Fr_to_montg(Fr *res, const Fr *a) { - mul_mont_sparse_256((limb_t*)res, (limb_t*)a, BLS12_381_rRR, BLS12_381_r, r0); + mul_mont_sparse_256((limb_t *)res, (limb_t *)a, BLS12_381_rRR, BLS12_381_r, + r0); } // res = a*R^(-1) void Fr_from_montg(Fr *res, const Fr *a) { - from_mont_256((limb_t*)res, (limb_t*)a, BLS12_381_r, r0); + from_mont_256((limb_t *)res, (limb_t *)a, BLS12_381_r, r0); } // res = a^(-1)*R void Fr_inv_montg_eucl(Fr *res, const Fr *a) { - // copied and modified from BLST code - // Copyright Supranational LLC - static const vec256 rx2 = { /* left-aligned value of the modulus */ - TO_LIMB_T(0xfffffffe00000002), TO_LIMB_T(0xa77b4805fffcb7fd), - TO_LIMB_T(0x6673b0101343b00a), TO_LIMB_T(0xe7db4ea6533afa90), - }; - vec512 temp; - ct_inverse_mod_256(temp, (limb_t*)a, BLS12_381_r, rx2); - redc_mont_256((limb_t*)res, temp, BLS12_381_r, r0); + // copied and modified from BLST code + // Copyright Supranational LLC + static const vec256 rx2 = { + /* left-aligned value of the modulus */ + TO_LIMB_T(0xfffffffe00000002), + TO_LIMB_T(0xa77b4805fffcb7fd), + TO_LIMB_T(0x6673b0101343b00a), + TO_LIMB_T(0xe7db4ea6533afa90), + }; + vec512 temp; + ct_inverse_mod_256(temp, (limb_t *)a, BLS12_381_r, rx2); + redc_mont_256((limb_t *)res, temp, BLS12_381_r, r0); } // result is in Montgomery form if base is in montgomery form @@ -96,85 +100,85 @@ void Fr_inv_montg_eucl(Fr *res, const Fr *a) { // In general, res = base^expo * R^(-expo+1) // `expo` is encoded as a little-endian limb_t table of length `expo_len`. 
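As background for the Montgomery helpers above and the exponentiation routine that follows, one Montgomery multiplication computes $M(x,y) = x\,y\,R^{-1} \bmod r$ (this is what the `res = a*b*R^(-1)` comments describe). A short sketch of why the stated results hold:

$$M(aR, bR) = abR, \qquad M(a, R^2) = aR \;(\texttt{Fr\_to\_montg}), \qquad M(\bar{a}, 1) = \bar{a}\,R^{-1} \;(\texttt{Fr\_from\_montg}).$$

In any multiplication chain built from copies of a base $x$, an intermediate value representing $x^k$ carries the factor $R^{-(k-1)}$, because $M(x^a R^{-(a-1)},\, x^b R^{-(b-1)}) = x^{a+b} R^{-(a+b-1)}$. The square-and-multiply loop therefore ends with

$$\mathrm{res} = x^{e}\,R^{-e+1},$$

which becomes $b^{e}R$ when the base is already in Montgomery form ($x = bR$), matching the comment on `Fr_exp_montg`.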
// TODO: could be deleted -void Fr_exp_montg(Fr *res, const Fr* base, const limb_t* expo, const int expo_len) { - // mask of the most significant bit - const limb_t msb_mask = (limb_t)1<<((sizeof(limb_t)<<3)-1); - limb_t mask = msb_mask; - int index = 0; - - expo += expo_len; - // process most significant zero limbs - while((index < expo_len) && (*(--expo) == 0)) { - index++; - } - // if expo is zero - if (index == expo_len) { - Fr_copy(res, base); - return; - } - // expo is non zero - // process the most significant zero bits - while((*expo & mask) == 0) { - mask >>= 1; +void Fr_exp_montg(Fr *res, const Fr *base, const limb_t *expo, + const int expo_len) { + // mask of the most significant bit + const limb_t msb_mask = (limb_t)1 << ((sizeof(limb_t) << 3) - 1); + limb_t mask = msb_mask; + int index = 0; + + expo += expo_len; + // process most significant zero limbs + while ((index < expo_len) && (*(--expo) == 0)) { + index++; + } + // if expo is zero + if (index == expo_len) { + Fr_copy(res, base); + return; + } + // expo is non zero + // process the most significant zero bits + while ((*expo & mask) == 0) { + mask >>= 1; + } + Fr tmp; + // process the first `1` bit + Fr_copy(&tmp, base); + mask >>= 1; + // Scan all limbs of the exponent + for (; index < expo_len; expo--) { + // Scan all bits + for (; mask != 0; mask >>= 1) { + // square + Fr_squ_montg(&tmp, &tmp); + // multiply + if (*expo & mask) { + Fr_mul_montg(&tmp, &tmp, base); + } } - Fr tmp; - // process the first `1` bit - Fr_copy(&tmp, base); - mask >>= 1; - // Scan all limbs of the exponent - for ( ; index < expo_len; expo--) { - // Scan all bits - for ( ; mask != 0 ; mask >>= 1 ) { - // square - Fr_squ_montg(&tmp, &tmp); - // multiply - if (*expo & mask) { - Fr_mul_montg(&tmp, &tmp ,base); - } - } - mask = msb_mask; - index++; - } - Fr_copy(res, &tmp); + mask = msb_mask; + index++; + } + Fr_copy(res, &tmp); } // TODO: could be deleted void Fr_inv_exp_montg(Fr *res, const Fr *a) { - Fr r_2; - Fr_copy(&r_2, (Fr*)BLS12_381_r); - r_2.limbs[0] -= 2; - Fr_exp_montg(res, a, (limb_t*)&r_2, 4); + Fr r_2; + Fr_copy(&r_2, (Fr *)BLS12_381_r); + r_2.limbs[0] -= 2; + Fr_exp_montg(res, a, (limb_t *)&r_2, 4); } // computes the sum of the array elements and writes the sum in jointx -void Fr_sum_vector(Fr* jointx, const Fr x[], const int len) { - Fr_set_zero(jointx); - for (int i=0; i Fr_BYTES) { - // limbs_from_be_bytes works for both limb endiannesses - limbs_from_be_bytes((limb_t*)&digit, p -= Fr_BYTES, Fr_BYTES); // l_i - Fr_mul_montg(&digit, &digit, &radix); // l_i * R^i (i is the loop number starting at 1) - Fr_add(out, out, &digit); - Fr_mul_montg(&radix, &radix, (Fr*)BLS12_381_rRR); // R^(i+1) - n -= Fr_BYTES; - } - Fr_set_zero(&digit); - limbs_from_be_bytes((limb_t*)&digit, p - n, n); - Fr_mul_montg(&digit, &digit, &radix); +// TODO: check redc_mont_256(vec256 ret, const vec512 a, const vec256 p, limb_t +// n0); +static void Fr_from_be_bytes(Fr *out, const byte *bytes, size_t n) { + Fr digit, radix; + Fr_set_zero(out); + Fr_copy(&radix, (Fr *)BLS12_381_rRR); // R^2 + + byte *p = (byte *)bytes + n; + while (n > Fr_BYTES) { + // limbs_from_be_bytes works for both limb endiannesses + limbs_from_be_bytes((limb_t *)&digit, p -= Fr_BYTES, Fr_BYTES); // l_i + Fr_mul_montg(&digit, &digit, + &radix); // l_i * R^i (i is the loop number starting at 1) Fr_add(out, out, &digit); - // at this point : out = l_1*R + L_2*R^2 .. 
+ L_n*R^n - // reduce the extra R - Fr_from_montg(out, out); - // clean up possible sensitive data - Fr_set_zero(&digit); + Fr_mul_montg(&radix, &radix, (Fr *)BLS12_381_rRR); // R^(i+1) + n -= Fr_BYTES; + } + Fr_set_zero(&digit); + limbs_from_be_bytes((limb_t *)&digit, p - n, n); + Fr_mul_montg(&digit, &digit, &radix); + Fr_add(out, out, &digit); + // at this point : out = l_1*R + L_2*R^2 .. + L_n*R^n + // reduce the extra R + Fr_from_montg(out, out); + // clean up possible sensitive data + Fr_set_zero(&digit); } // Reads a scalar from an array and maps it to Fr using modular reduction. // Input is byte-big-endian as used by the external APIs. // It returns true if scalar is zero and false otherwise. -bool map_bytes_to_Fr(Fr* a, const byte* bin, int len) { - Fr_from_be_bytes(a, bin, len); - return Fr_is_zero(a); +bool map_bytes_to_Fr(Fr *a, const byte *bin, int len) { + Fr_from_be_bytes(a, bin, len); + return Fr_is_zero(a); } // ------------------- Fp utilities // Montgomery constants related to the prime p -const Fp BLS12_381_pR = { ONE_MONT_P }; /* R mod p = (1<<384)%p */ +const Fp BLS12_381_pR = {ONE_MONT_P}; /* R mod p = (1<<384)%p */ // sets `a` to 0 -static void Fp_set_zero(Fp* a){ - vec_zero((byte*)a, sizeof(Fp)); -} +static void Fp_set_zero(Fp *a) { vec_zero((byte *)a, sizeof(Fp)); } // sets `a` to limb `l` -static void Fp_set_limb(Fp* a, const limb_t l){ - vec_zero((byte*)a + sizeof(limb_t), sizeof(Fp) - sizeof(limb_t)); - *((limb_t*)a) = l; +static void Fp_set_limb(Fp *a, const limb_t l) { + vec_zero((byte *)a + sizeof(limb_t), sizeof(Fp) - sizeof(limb_t)); + *((limb_t *)a) = l; } -void Fp_copy(Fp* res, const Fp* a) { - if ((uptr_t)a==(uptr_t)res) { - return; - } - vec_copy((byte*)res, (byte*)a, sizeof(Fp)); +void Fp_copy(Fp *res, const Fp *a) { + if ((uptr_t)a == (uptr_t)res) { + return; + } + vec_copy((byte *)res, (byte *)a, sizeof(Fp)); } static void Fp_add(Fp *res, const Fp *a, const Fp *b) { - add_mod_384((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_P); + add_mod_384((limb_t *)res, (limb_t *)a, (limb_t *)b, BLS12_381_P); } static void Fp_sub(Fp *res, const Fp *a, const Fp *b) { - sub_mod_384((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_P); + sub_mod_384((limb_t *)res, (limb_t *)a, (limb_t *)b, BLS12_381_P); } static void Fp_neg(Fp *res, const Fp *a) { - cneg_mod_384((limb_t*)res, (limb_t*)a, 1, BLS12_381_P); + cneg_mod_384((limb_t *)res, (limb_t *)a, 1, BLS12_381_P); } -// checks if `a` is a quadratic residue in Fp. If yes, it computes +// checks if `a` is a quadratic residue in Fp. If yes, it computes // the square root in `res`. -// +// // The boolean output is valid whether `a` is in Montgomery form or not, // since montgomery constant `R` is a quadratic residue. // However, the square root is valid only if `a` is in montgomery form. -static bool Fp_sqrt_montg(Fp *res, const Fp* a) { - return sqrt_fp((limb_t*)res, (limb_t*)a); +static bool Fp_sqrt_montg(Fp *res, const Fp *a) { + return sqrt_fp((limb_t *)res, (limb_t *)a); } -static bool Fp_check(const Fp* in) { - // use same method as in BLST internal function - // which seems the most efficient. The method uses the assembly-based - // modular addition instead of limbs comparison - Fp temp; - Fp_add(&temp, in, &ZERO_384); - return vec_is_equal(&temp, in, Fp_BYTES); - // no need to clear `tmp` as no use-case involves sensitive data being passed as `in` +static bool Fp_check(const Fp *in) { + // use same method as in BLST internal function + // which seems the most efficient. 
The method uses the assembly-based + // modular addition instead of limbs comparison + Fp temp; + Fp_add(&temp, in, &ZERO_384); + return vec_is_equal(&temp, in, Fp_BYTES); + // no need to clear `tmp` as no use-case involves sensitive data being passed + // as `in` } // res = a*b*R^(-1) void Fp_mul_montg(Fp *res, const Fp *a, const Fp *b) { - mul_mont_384((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_P, p0); + mul_mont_384((limb_t *)res, (limb_t *)a, (limb_t *)b, BLS12_381_P, p0); } // res = a^2 * R^(-1) void Fp_squ_montg(Fp *res, const Fp *a) { - sqr_mont_384((limb_t*)res, (limb_t*)a, BLS12_381_P, p0); + sqr_mont_384((limb_t *)res, (limb_t *)a, BLS12_381_P, p0); } // res = a*R void Fp_to_montg(Fp *res, const Fp *a) { - mul_mont_384((limb_t*)res, (limb_t*)a, BLS12_381_RR, BLS12_381_P, p0); + mul_mont_384((limb_t *)res, (limb_t *)a, BLS12_381_RR, BLS12_381_P, p0); } // res = a*R^(-1) void Fp_from_montg(Fp *res, const Fp *a) { - from_mont_384((limb_t*)res, (limb_t*)a, BLS12_381_P, p0); + from_mont_384((limb_t *)res, (limb_t *)a, BLS12_381_P, p0); } // reads a scalar in `a` and checks it is a valid Fp element (a < p). @@ -338,82 +345,81 @@ void Fp_from_montg(Fp *res, const Fp *a) { // returns: // - BAD_ENCODING if the length is invalid // - BAD_VALUE if the scalar isn't in Fp -// - VALID if the scalar is valid -ERROR Fp_read_bytes(Fp* a, const byte *bin, int len) { - if (len != Fp_BYTES) { - return BAD_ENCODING; - } - limbs_from_be_bytes((limb_t*)a, bin, Fp_BYTES); - // compare read scalar to p - if (!Fp_check(a)) { - return BAD_VALUE; - } - return VALID; +// - VALID if the scalar is valid +ERROR Fp_read_bytes(Fp *a, const byte *bin, int len) { + if (len != Fp_BYTES) { + return BAD_ENCODING; + } + limbs_from_be_bytes((limb_t *)a, bin, Fp_BYTES); + // compare read scalar to p + if (!Fp_check(a)) { + return BAD_VALUE; + } + return VALID; } - -// write Fp element to bin and assume `bin` has `Fp_BYTES` allocated bytes. -void Fp_write_bytes(byte *bin, const Fp* a) { - be_bytes_from_limbs(bin, (limb_t*)a, Fp_BYTES); +// write Fp element to bin and assume `bin` has `Fp_BYTES` allocated bytes. +void Fp_write_bytes(byte *bin, const Fp *a) { + be_bytes_from_limbs(bin, (limb_t *)a, Fp_BYTES); } // returns the sign of y. // 1 if y > (p - 1)/2 and 0 otherwise. // y is in montgomery form -static byte Fp_get_sign(const Fp* y) { - // BLST's sgn0_pty_mont_384 requires input to be in Montg form. - // The needed sign bit is on position 1 ! - return (sgn0_pty_mont_384((const limb_t*)y, BLS12_381_P, p0)>>1) & 1; +static byte Fp_get_sign(const Fp *y) { + // BLST's sgn0_pty_mont_384 requires input to be in Montg form. + // The needed sign bit is on position 1 ! 
+ return (sgn0_pty_mont_384((const limb_t *)y, BLS12_381_P, p0) >> 1) & 1; } // ------------------- Fp^2 utilities // sets `a` to limb `l` -static void Fp2_set_limb(Fp2* a, const limb_t l){ - Fp_set_limb(&real(a), l); - Fp_set_zero(&imag(a)); +static void Fp2_set_limb(Fp2 *a, const limb_t l) { + Fp_set_limb(&real(a), l); + Fp_set_zero(&imag(a)); } static void Fp2_add(Fp2 *res, const Fp2 *a, const Fp2 *b) { - add_mod_384x((vec384*)res, (vec384*)a, (vec384*)b, BLS12_381_P); + add_mod_384x((vec384 *)res, (vec384 *)a, (vec384 *)b, BLS12_381_P); } static void Fp2_sub(Fp2 *res, const Fp2 *a, const Fp2 *b) { - sub_mod_384x((vec384*)res, (vec384*)a, (vec384*)b, BLS12_381_P); + sub_mod_384x((vec384 *)res, (vec384 *)a, (vec384 *)b, BLS12_381_P); } static void Fp2_neg(Fp2 *res, const Fp2 *a) { - cneg_mod_384(real(res), real(a), 1, BLS12_381_P); - cneg_mod_384(imag(res), imag(a), 1, BLS12_381_P); + cneg_mod_384(real(res), real(a), 1, BLS12_381_P); + cneg_mod_384(imag(res), imag(a), 1, BLS12_381_P); } // res = a*b in montgomery form static void Fp2_mul_montg(Fp2 *res, const Fp2 *a, const Fp2 *b) { - mul_mont_384x((vec384*)res, (vec384*)a, (vec384*)b, BLS12_381_P, p0); + mul_mont_384x((vec384 *)res, (vec384 *)a, (vec384 *)b, BLS12_381_P, p0); } // res = a^2 in montgomery form static void Fp2_squ_montg(Fp2 *res, const Fp2 *a) { - sqr_mont_384x((vec384*)res, (vec384*)a, BLS12_381_P, p0); + sqr_mont_384x((vec384 *)res, (vec384 *)a, BLS12_381_P, p0); } -// checks if `a` is a quadratic residue in Fp^2. If yes, it computes +// checks if `a` is a quadratic residue in Fp^2. If yes, it computes // the square root in `res`. -// +// // The boolean output is valid whether `a` is in Montgomery form or not, // since montgomery constant `R` is a quadratic residue. // However, the square root is valid only if `a` is in montgomery form. -static bool Fp2_sqrt_montg(Fp2 *res, const Fp2* a) { - return sqrt_fp2((vec384*)res, (vec384*)a); +static bool Fp2_sqrt_montg(Fp2 *res, const Fp2 *a) { + return sqrt_fp2((vec384 *)res, (vec384 *)a); } // returns the sign of y. // sign(y_0) if y_1 = 0, else sign(y_1) // y coordinates must be in montgomery form -static byte Fp2_get_sign(Fp2* y) { - // BLST's sgn0_pty_mont_384x requires input to be in Montg form. - // The needed sign bit is on position 1 ! - return (sgn0_pty_mont_384x((vec384*)y, BLS12_381_P, p0)>>1) & 1; +static byte Fp2_get_sign(Fp2 *y) { + // BLST's sgn0_pty_mont_384x requires input to be in Montg form. + // The needed sign bit is on position 1 ! + return (sgn0_pty_mont_384x((vec384 *)y, BLS12_381_P, p0) >> 1) & 1; } // reads an Fp^2 element in `a`. @@ -422,745 +428,762 @@ static byte Fp2_get_sign(Fp2* y) { // returns: // - BAD_ENCODING if the length is invalid // - BAD_VALUE if the scalar isn't in Fp -// - VALID if the scalar is valid -static ERROR Fp2_read_bytes(Fp2* a, const byte *bin, int len) { - if (len != Fp2_BYTES) { - return BAD_ENCODING; - } - ERROR ret = Fp_read_bytes(&real(a), bin, Fp_BYTES); - if (ret != VALID) { - return ret; - } - ret = Fp_read_bytes(&imag(a), bin + Fp_BYTES, Fp_BYTES); - if ( ret != VALID) { - return ret; - } - return VALID; -} - -// write Fp2 element to bin and assume `bin` has `Fp2_BYTES` allocated bytes. 
-void Fp2_write_bytes(byte *bin, const Fp2* a) { - Fp_write_bytes(bin, &real(a)); - Fp_write_bytes(bin + Fp_BYTES, &imag(a)); +// - VALID if the scalar is valid +static ERROR Fp2_read_bytes(Fp2 *a, const byte *bin, int len) { + if (len != Fp2_BYTES) { + return BAD_ENCODING; + } + ERROR ret = Fp_read_bytes(&real(a), bin, Fp_BYTES); + if (ret != VALID) { + return ret; + } + ret = Fp_read_bytes(&imag(a), bin + Fp_BYTES, Fp_BYTES); + if (ret != VALID) { + return ret; + } + return VALID; +} + +// write Fp2 element to bin and assume `bin` has `Fp2_BYTES` allocated bytes. +void Fp2_write_bytes(byte *bin, const Fp2 *a) { + Fp_write_bytes(bin, &real(a)); + Fp_write_bytes(bin + Fp_BYTES, &imag(a)); } // ------------------- E1 utilities -void E1_copy(E1* res, const E1* p) { - if ((uptr_t)p == (uptr_t)res) { - return; - } - vec_copy(res, p, sizeof(E1)); +void E1_copy(E1 *res, const E1 *p) { + if ((uptr_t)p == (uptr_t)res) { + return; + } + vec_copy(res, p, sizeof(E1)); } // checks p1 == p2 -bool E1_is_equal(const E1* p1, const E1* p2) { - // `POINTonE1_is_equal` includes the infinity case - return POINTonE1_is_equal((const POINTonE1*)p1, (const POINTonE1*)p2); +bool E1_is_equal(const E1 *p1, const E1 *p2) { + // `POINTonE1_is_equal` includes the infinity case + return POINTonE1_is_equal((const POINTonE1 *)p1, (const POINTonE1 *)p2); } // compare p to infinity -bool E1_is_infty(const E1* p) { - // BLST infinity points are defined by Z=0 - return vec_is_zero(p->z, sizeof(p->z)); +bool E1_is_infty(const E1 *p) { + // BLST infinity points are defined by Z=0 + return vec_is_zero(p->z, sizeof(p->z)); } // set p to infinity -void E1_set_infty(E1* p) { - // BLST infinity points are defined by Z=0 - vec_zero(p->z, sizeof(p->z)); +void E1_set_infty(E1 *p) { + // BLST infinity points are defined by Z=0 + vec_zero(p->z, sizeof(p->z)); } // converts an E1 point from Jacobian into affine coordinates (z=1) -void E1_to_affine(E1* res, const E1* p) { - // optimize in case coordinates are already affine - if (vec_is_equal(p->z, BLS12_381_pR, Fp_BYTES)) { - E1_copy(res, p); - return; - } - // convert from Jacobian - POINTonE1_from_Jacobian((POINTonE1*)res, (const POINTonE1*)p); +void E1_to_affine(E1 *res, const E1 *p) { + // optimize in case coordinates are already affine + if (vec_is_equal(p->z, BLS12_381_pR, Fp_BYTES)) { + E1_copy(res, p); + return; + } + // convert from Jacobian + POINTonE1_from_Jacobian((POINTonE1 *)res, (const POINTonE1 *)p); } // checks affine point `p` is in E1 -bool E1_affine_on_curve(const E1* p) { - // BLST's `POINTonE1_affine_on_curve` does not include the inifity case! - return POINTonE1_affine_on_curve((POINTonE1_affine*)p) | E1_is_infty(p); +bool E1_affine_on_curve(const E1 *p) { + // BLST's `POINTonE1_affine_on_curve` does not include the inifity case! + return POINTonE1_affine_on_curve((POINTonE1_affine *)p) | E1_is_infty(p); } // checks if input E1 point is on the subgroup G1. // It assumes input `p` is on E1. -bool E1_in_G1(const E1* p){ - // currently uses Scott method - return POINTonE1_in_G1((const POINTonE1*)p); +bool E1_in_G1(const E1 *p) { + // currently uses Scott method + return POINTonE1_in_G1((const POINTonE1 *)p); } -// E1_read_bytes imports a E1(Fp) point from a buffer in a compressed or uncompressed form. -// The resulting point is guaranteed to be on curve E1 (no G1 check is included). -// Expected serialization follows: +// E1_read_bytes imports a E1(Fp) point from a buffer in a compressed or +// uncompressed form. 
The resulting point is guaranteed to be on curve E1 (no G1 +// check is included). Expected serialization follows: // https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) // // returns: -// - BAD_ENCODING if the length is invalid or serialization header bits are invalid +// - BAD_ENCODING if the length is invalid or serialization header bits are +// invalid // - BAD_VALUE if Fp coordinates couldn't deserialize // - POINT_NOT_ON_CURVE if deserialized point isn't on E1 -// - VALID if deserialization is valid +// - VALID if deserialization is valid -// TODO: replace with POINTonE1_Deserialize_BE and POINTonE1_Uncompress_Z, +// TODO: replace with POINTonE1_Deserialize_BE and POINTonE1_Uncompress_Z, // and update logic with G2 subgroup check? -ERROR E1_read_bytes(E1* a, const byte *bin, const int len) { - // check the length - if (len != G1_SER_BYTES) { - return BAD_ENCODING; +ERROR E1_read_bytes(E1 *a, const byte *bin, const int len) { + // check the length + if (len != G1_SER_BYTES) { + return BAD_ENCODING; + } + + // check the compression bit + int compressed = bin[0] >> 7; + if ((compressed == 1) != (G1_SERIALIZATION == COMPRESSED)) { + return BAD_ENCODING; + } + + // check if the point in infinity + int is_infinity = bin[0] & 0x40; + if (is_infinity) { + // the remaining bits need to be cleared + if (bin[0] & 0x3F) { + return BAD_ENCODING; } - - // check the compression bit - int compressed = bin[0] >> 7; - if ((compressed == 1) != (G1_SERIALIZATION == COMPRESSED)) { - return BAD_ENCODING; - } - - // check if the point in infinity - int is_infinity = bin[0] & 0x40; - if (is_infinity) { - // the remaining bits need to be cleared - if (bin[0] & 0x3F) { - return BAD_ENCODING; - } - for (int i=1; i> 5) & 1; - if (y_sign && (!compressed)) { + for (int i = 1; i < G1_SER_BYTES - 1; i++) { + if (bin[i]) { return BAD_ENCODING; - } - - // use a temporary buffer to mask the header bits and read a.x - byte temp[Fp_BYTES]; - memcpy(temp, bin, Fp_BYTES); - temp[0] &= 0x1F; // clear the header bits - ERROR ret = Fp_read_bytes(&a->x, temp, sizeof(temp)); - if (ret != VALID) { - return ret; - } - Fp_to_montg(&a->x, &a->x); - - // set a.z to 1 - Fp_copy(&a->z, &BLS12_381_pR); - - if (G1_SERIALIZATION == UNCOMPRESSED) { - ret = Fp_read_bytes(&a->y, bin + Fp_BYTES, sizeof(a->y)); - if (ret != VALID){ - return ret; - } - Fp_to_montg(&a->y, &a->y); - // check read point is on curve - if (!E1_affine_on_curve(a)) { - return POINT_NOT_ON_CURVE; - } - return VALID; + } } - - // compute the possible square root - Fp_squ_montg(&a->y, &a->x); - Fp_mul_montg(&a->y, &a->y, &a->x); // x^3 - Fp_add(&a->y, &a->y, &B_E1); // B_E1 is already in Montg form - if (!Fp_sqrt_montg(&a->y, &a->y)) { // check whether x^3+b is a quadratic residue - return POINT_NOT_ON_CURVE; + E1_set_infty(a); + return VALID; + } + + // read the sign bit and check for consistency + int y_sign = (bin[0] >> 5) & 1; + if (y_sign && (!compressed)) { + return BAD_ENCODING; + } + + // use a temporary buffer to mask the header bits and read a.x + byte temp[Fp_BYTES]; + memcpy(temp, bin, Fp_BYTES); + temp[0] &= 0x1F; // clear the header bits + ERROR ret = Fp_read_bytes(&a->x, temp, sizeof(temp)); + if (ret != VALID) { + return ret; + } + Fp_to_montg(&a->x, &a->x); + + // set a.z to 1 + Fp_copy(&a->z, &BLS12_381_pR); + + if (G1_SERIALIZATION == UNCOMPRESSED) { + ret = Fp_read_bytes(&a->y, bin + Fp_BYTES, sizeof(a->y)); + if (ret != VALID) { + return ret; } - - // resulting (x,y) is guaranteed to be 
on curve (y is already in Montg form) - if (Fp_get_sign(&a->y) != y_sign) { - Fp_neg(&a->y, &a->y); // flip y sign if needed + Fp_to_montg(&a->y, &a->y); + // check read point is on curve + if (!E1_affine_on_curve(a)) { + return POINT_NOT_ON_CURVE; } return VALID; -} - -// E1_write_bytes exports a point in E1(Fp) to a buffer in a compressed or uncompressed form. -// It assumes buffer is of length G1_SER_BYTES -// The serialization follows: + } + + // compute the possible square root + Fp_squ_montg(&a->y, &a->x); + Fp_mul_montg(&a->y, &a->y, &a->x); // x^3 + Fp_add(&a->y, &a->y, &B_E1); // B_E1 is already in Montg form + if (!Fp_sqrt_montg(&a->y, + &a->y)) { // check whether x^3+b is a quadratic residue + return POINT_NOT_ON_CURVE; + } + + // resulting (x,y) is guaranteed to be on curve (y is already in Montg form) + if (Fp_get_sign(&a->y) != y_sign) { + Fp_neg(&a->y, &a->y); // flip y sign if needed + } + return VALID; +} + +// E1_write_bytes exports a point in E1(Fp) to a buffer in a compressed or +// uncompressed form. It assumes buffer is of length G1_SER_BYTES The +// serialization follows: // https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -void E1_write_bytes(byte *bin, const E1* a) { - if (E1_is_infty(a)) { - // set the infinity bit - bin[0] = (G1_SERIALIZATION << 7) | (1 << 6); - memset(bin+1, 0, G1_SER_BYTES-1); - return; - } - E1 tmp; - E1_to_affine(&tmp, a); - - Fp_from_montg(&tmp.x, &tmp.x); - Fp_write_bytes(bin, &tmp.x); - - if (G1_SERIALIZATION == COMPRESSED) { - bin[0] |= (Fp_get_sign(&tmp.y) << 5); - } else { - Fp_from_montg(&tmp.y, &tmp.y); - Fp_write_bytes(bin + Fp_BYTES, &tmp.y); - } - // compression bit - bin[0] |= (G1_SERIALIZATION << 7); +void E1_write_bytes(byte *bin, const E1 *a) { + if (E1_is_infty(a)) { + // set the infinity bit + bin[0] = (G1_SERIALIZATION << 7) | (1 << 6); + memset(bin + 1, 0, G1_SER_BYTES - 1); + return; + } + E1 tmp; + E1_to_affine(&tmp, a); + + Fp_from_montg(&tmp.x, &tmp.x); + Fp_write_bytes(bin, &tmp.x); + + if (G1_SERIALIZATION == COMPRESSED) { + bin[0] |= (Fp_get_sign(&tmp.y) << 5); + } else { + Fp_from_montg(&tmp.y, &tmp.y); + Fp_write_bytes(bin + Fp_BYTES, &tmp.y); + } + // compression bit + bin[0] |= (G1_SERIALIZATION << 7); } // generic point addition that must handle doubling and points at infinity -void E1_add(E1* res, const E1* a, const E1* b) { - POINTonE1_dadd((POINTonE1*)res, (POINTonE1*)a, (POINTonE1*)b, NULL); +void E1_add(E1 *res, const E1 *a, const E1 *b) { + POINTonE1_dadd((POINTonE1 *)res, (POINTonE1 *)a, (POINTonE1 *)b, NULL); } // Point negation: res = -a -void E1_neg(E1* res, const E1* a) { - E1_copy(res, a); - POINTonE1_cneg((POINTonE1*)res, 1); +void E1_neg(E1 *res, const E1 *a) { + E1_copy(res, a); + POINTonE1_cneg((POINTonE1 *)res, 1); } // Exponentiation of a generic point `a` in E1, res = expo.a -void E1_mult(E1* res, const E1* p, const Fr* expo) { - pow256 tmp; - pow256_from_Fr(tmp, expo); - POINTonE1_mult_glv((POINTonE1*)res, (POINTonE1*)p, tmp); - vec_zero(&tmp, sizeof(tmp)); +void E1_mult(E1 *res, const E1 *p, const Fr *expo) { + pow256 tmp; + pow256_from_Fr(tmp, expo); + POINTonE1_mult_glv((POINTonE1 *)res, (POINTonE1 *)p, tmp); + vec_zero(&tmp, sizeof(tmp)); } // computes the sum of the E1 array elements `y[i]` and writes it in `sum`. 
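`E1_write_bytes` above packs three flags into the top bits of the first serialized byte, and `E1_read_bytes` checks the same bits (compression in bit 7, infinity in bit 6, y-sign in bit 5), per the Zcash-style format linked in the comments. A small self-contained sketch of just this header decoding; the struct and function names here (`g1_header`, `decode_g1_header`) are illustrative only:

```c
#include <stdbool.h>
#include <stdio.h>

// The three header flags carried by the first byte of a serialized point.
typedef struct {
  bool compressed; // bit 7: 1 if only x is encoded
  bool infinity;   // bit 6: 1 if the point is the point at infinity
  bool y_sign;     // bit 5: sign of y, only meaningful when compressed
} g1_header;

static g1_header decode_g1_header(unsigned char first_byte) {
  g1_header h;
  h.compressed = (first_byte >> 7) & 1;
  h.infinity = (first_byte >> 6) & 1;
  h.y_sign = (first_byte >> 5) & 1;
  return h;
}

int main(void) {
  // 0xC0: compressed encoding of the point at infinity (remaining bytes zero)
  g1_header h = decode_g1_header(0xC0);
  printf("compressed=%d infinity=%d y_sign=%d\n", h.compressed, h.infinity,
         h.y_sign);
  return 0;
}
```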
-void E1_sum_vector(E1* sum, const E1* y, const int len){ - E1_set_infty(sum); - for (int i=0; i= Fr_BYTES); - // map to Fr - Fr log; - map_bytes_to_Fr(&log, bytes, len); - // multiplies G1 generator by a random scalar - G1_mult_gen(p, &log); +void unsafe_map_bytes_to_G1(E1 *p, const byte *bytes, int len) { + assert(len >= Fr_BYTES); + // map to Fr + Fr log; + map_bytes_to_Fr(&log, bytes, len); + // multiplies G1 generator by a random scalar + G1_mult_gen(p, &log); } -// maps bytes to a point in E1\G1. +// maps bytes to a point in E1\G1. // `len` must be at least 96 bytes. // this is a testing file only, should not be used in any protocol! -void unsafe_map_bytes_to_G1complement(E1* p, const byte* bytes, int len) { - assert(len >= 96); - Fp u; - map_96_bytes_to_Fp(&u, bytes, 96); - // map to E1's isogenous and then to E1 - map_to_isogenous_E1((POINTonE1 *)p, u); - isogeny_map_to_E1((POINTonE1 *)p, (POINTonE1 *)p); - // clear G1 order - E1_mult(p, p, (Fr*)&BLS12_381_r); +void unsafe_map_bytes_to_G1complement(E1 *p, const byte *bytes, int len) { + assert(len >= 96); + Fp u; + map_96_bytes_to_Fp(&u, bytes, 96); + // map to E1's isogenous and then to E1 + map_to_isogenous_E1((POINTonE1 *)p, u); + isogeny_map_to_E1((POINTonE1 *)p, (POINTonE1 *)p); + // clear G1 order + E1_mult(p, p, (Fr *)&BLS12_381_r); } // ------------------- E2 utilities -const E2* BLS12_381_g2 = (const E2*)&BLS12_381_G2; -const E2* BLS12_381_minus_g2 = (const E2*)&BLS12_381_NEG_G2; +const E2 *BLS12_381_g2 = (const E2 *)&BLS12_381_G2; +const E2 *BLS12_381_minus_g2 = (const E2 *)&BLS12_381_NEG_G2; -// E2_read_bytes imports a E2(Fp^2) point from a buffer in a compressed or uncompressed form. -// The resulting point is guaranteed to be on curve E2 (no G2 check is included). +// E2_read_bytes imports a E2(Fp^2) point from a buffer in a compressed or +// uncompressed form. The resulting point is guaranteed to be on curve E2 (no G2 +// check is included). // // returns: -// - BAD_ENCODING if the length is invalid or serialization header bits are invalid +// - BAD_ENCODING if the length is invalid or serialization header bits are +// invalid // - BAD_VALUE if Fp^2 coordinates couldn't deserialize // - POINT_NOT_ON_CURVE if deserialized point isn't on E2 -// - VALID if deserialization is valid +// - VALID if deserialization is valid -// TODO: replace with POINTonE2_Deserialize_BE and POINTonE2_Uncompress_Z, +// TODO: replace with POINTonE2_Deserialize_BE and POINTonE2_Uncompress_Z, // and update logic with G2 subgroup check? 
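`unsafe_map_bytes_to_G1complement` above ends by multiplying the mapped point by the group order r to clear the G1 component. A one-line justification, assuming the standard fact that $|E_1(\mathbb{F}_p)| = h_1 \cdot r$ with $\gcd(h_1, r) = 1$:

$$\mathrm{ord}([r]P) \mid h_1 \quad\Longrightarrow\quad [r]P \in G_1 \iff [r]P = \mathcal{O},$$

so the result lies in $E_1 \setminus G_1$ whenever it is not the point at infinity.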
-ERROR E2_read_bytes(E2* a, const byte *bin, const int len) { - // check the length - if (len != G2_SER_BYTES) { - return BAD_ENCODING; +ERROR E2_read_bytes(E2 *a, const byte *bin, const int len) { + // check the length + if (len != G2_SER_BYTES) { + return BAD_ENCODING; + } + + // check the compression bit + int compressed = bin[0] >> 7; + if ((compressed == 1) != (G2_SERIALIZATION == COMPRESSED)) { + return BAD_ENCODING; + } + + // check if the point in infinity + int is_infinity = bin[0] & 0x40; + if (is_infinity) { + // the remaining bits need to be cleared + if (bin[0] & 0x3F) { + return BAD_ENCODING; } - - // check the compression bit - int compressed = bin[0] >> 7; - if ((compressed == 1) != (G2_SERIALIZATION == COMPRESSED)) { - return BAD_ENCODING; - } - - // check if the point in infinity - int is_infinity = bin[0] & 0x40; - if (is_infinity) { - // the remaining bits need to be cleared - if (bin[0] & 0x3F) { - return BAD_ENCODING; - } - for (int i=1; i> 5) & 1; - if (y_sign && (!compressed)) { + for (int i = 1; i < G2_SER_BYTES - 1; i++) { + if (bin[i]) { return BAD_ENCODING; - } - - // use a temporary buffer to mask the header bits and read a.x - byte temp[Fp2_BYTES]; - memcpy(temp, bin, Fp2_BYTES); - temp[0] &= 0x1F; // clear the header bits - ERROR ret = Fp2_read_bytes(&a->x, temp, sizeof(temp)); - if (ret != VALID) { - return ret; + } } - Fp2* a_x = &(a->x); - Fp_to_montg(&real(a_x), &real(a_x)); - Fp_to_montg(&imag(a_x), &imag(a_x)); - - // set a.z to 1 - Fp2* a_z = &(a->z); - Fp_copy(&real(a_z), &BLS12_381_pR); - Fp_set_zero(&imag(a_z)); - - Fp2* a_y = &(a->y); - if (G2_SERIALIZATION == UNCOMPRESSED) { - ret = Fp2_read_bytes(a_y, bin + Fp2_BYTES, sizeof(a->y)); - if (ret != VALID){ - return ret; - } - Fp_to_montg(&real(a_y), &real(a_y)); - Fp_to_montg(&imag(a_y), &imag(a_y)); - // check read point is on curve - if (!E2_affine_on_curve(a)) { - return POINT_NOT_ON_CURVE; - } - return VALID; + E2_set_infty(a); + return VALID; + } + + // read the sign bit and check for consistency + int y_sign = (bin[0] >> 5) & 1; + if (y_sign && (!compressed)) { + return BAD_ENCODING; + } + + // use a temporary buffer to mask the header bits and read a.x + byte temp[Fp2_BYTES]; + memcpy(temp, bin, Fp2_BYTES); + temp[0] &= 0x1F; // clear the header bits + ERROR ret = Fp2_read_bytes(&a->x, temp, sizeof(temp)); + if (ret != VALID) { + return ret; + } + Fp2 *a_x = &(a->x); + Fp_to_montg(&real(a_x), &real(a_x)); + Fp_to_montg(&imag(a_x), &imag(a_x)); + + // set a.z to 1 + Fp2 *a_z = &(a->z); + Fp_copy(&real(a_z), &BLS12_381_pR); + Fp_set_zero(&imag(a_z)); + + Fp2 *a_y = &(a->y); + if (G2_SERIALIZATION == UNCOMPRESSED) { + ret = Fp2_read_bytes(a_y, bin + Fp2_BYTES, sizeof(a->y)); + if (ret != VALID) { + return ret; } - - // compute the possible square root - Fp2_squ_montg(a_y, a_x); - Fp2_mul_montg(a_y, a_y, a_x); // x^3 - Fp2_add(a_y, a_y, &B_E2); // B_E2 is already in Montg form - if (!Fp2_sqrt_montg(a_y, a_y)) // check whether x^3+b is a quadratic residue - return POINT_NOT_ON_CURVE; - - // resulting (x,y) is guaranteed to be on curve (y is already in Montg form) - if (Fp2_get_sign(a_y) != y_sign) { - Fp2_neg(a_y, a_y); // flip y sign if needed + Fp_to_montg(&real(a_y), &real(a_y)); + Fp_to_montg(&imag(a_y), &imag(a_y)); + // check read point is on curve + if (!E2_affine_on_curve(a)) { + return POINT_NOT_ON_CURVE; } return VALID; -} - -// E2_write_bytes exports a point in E2(Fp^2) to a buffer in a compressed or uncompressed form. 
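During decompression, `E2_read_bytes` above (like its E1 counterpart) recovers the missing y-coordinate from the curve equation; schematically, with b the curve constant stored in `B_E2` (resp. `B_E1`):

$$y^2 = x^3 + b \quad\Longrightarrow\quad y = \pm\sqrt{x^3 + b},$$

where the square root must exist for the input to describe a point on the curve (otherwise `POINT_NOT_ON_CURVE` is returned), and the sign is fixed to match the header's sign bit via `Fp2_get_sign` / `Fp_get_sign`.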
-// It assumes buffer is of length G2_SER_BYTES -// The serialization follows: -// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -void E2_write_bytes(byte *bin, const E2* a) { - if (E2_is_infty(a)) { - // set the infinity bit - bin[0] = (G2_SERIALIZATION << 7) | (1 << 6); - memset(bin+1, 0, G2_SER_BYTES-1); - return; - } - E2 tmp; - E2_to_affine(&tmp, a); + } - Fp2* t_x = &(tmp.x); - Fp_from_montg(&real(t_x), &real(t_x)); - Fp_from_montg(&imag(t_x), &imag(t_x)); - Fp2_write_bytes(bin, t_x); + // compute the possible square root + Fp2_squ_montg(a_y, a_x); + Fp2_mul_montg(a_y, a_y, a_x); // x^3 + Fp2_add(a_y, a_y, &B_E2); // B_E2 is already in Montg form + if (!Fp2_sqrt_montg(a_y, a_y)) // check whether x^3+b is a quadratic residue + return POINT_NOT_ON_CURVE; - Fp2* t_y = &(tmp.y); - if (G2_SERIALIZATION == COMPRESSED) { - bin[0] |= (Fp2_get_sign(t_y) << 5); - } else { - Fp_from_montg(&real(t_y), &real(t_y)); - Fp_from_montg(&imag(t_y), &imag(t_y)); - Fp2_write_bytes(bin + Fp2_BYTES, t_y); - } + // resulting (x,y) is guaranteed to be on curve (y is already in Montg form) + if (Fp2_get_sign(a_y) != y_sign) { + Fp2_neg(a_y, a_y); // flip y sign if needed + } + return VALID; +} - bin[0] |= (G2_SERIALIZATION << 7); +// E2_write_bytes exports a point in E2(Fp^2) to a buffer in a compressed or +// uncompressed form. It assumes buffer is of length G2_SER_BYTES The +// serialization follows: +// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) +void E2_write_bytes(byte *bin, const E2 *a) { + if (E2_is_infty(a)) { + // set the infinity bit + bin[0] = (G2_SERIALIZATION << 7) | (1 << 6); + memset(bin + 1, 0, G2_SER_BYTES - 1); + return; + } + E2 tmp; + E2_to_affine(&tmp, a); + + Fp2 *t_x = &(tmp.x); + Fp_from_montg(&real(t_x), &real(t_x)); + Fp_from_montg(&imag(t_x), &imag(t_x)); + Fp2_write_bytes(bin, t_x); + + Fp2 *t_y = &(tmp.y); + if (G2_SERIALIZATION == COMPRESSED) { + bin[0] |= (Fp2_get_sign(t_y) << 5); + } else { + Fp_from_montg(&real(t_y), &real(t_y)); + Fp_from_montg(&imag(t_y), &imag(t_y)); + Fp2_write_bytes(bin + Fp2_BYTES, t_y); + } + + bin[0] |= (G2_SERIALIZATION << 7); } // set p to infinity -void E2_set_infty(E2* p) { - // BLST infinity points are defined by Z=0 - vec_zero(p->z, sizeof(p->z)); +void E2_set_infty(E2 *p) { + // BLST infinity points are defined by Z=0 + vec_zero(p->z, sizeof(p->z)); } // check if `p` is infinity -bool E2_is_infty(const E2* p) { - // BLST infinity points are defined by Z=0 - return vec_is_zero(p->z, sizeof(p->z)); +bool E2_is_infty(const E2 *p) { + // BLST infinity points are defined by Z=0 + return vec_is_zero(p->z, sizeof(p->z)); } // checks affine point `p` is in E2 -bool E2_affine_on_curve(const E2* p) { - // BLST's `POINTonE2_affine_on_curve` does not include the infinity case! - return POINTonE2_affine_on_curve((POINTonE2_affine*)p) | E2_is_infty(p); +bool E2_affine_on_curve(const E2 *p) { + // BLST's `POINTonE2_affine_on_curve` does not include the infinity case! 
+ return POINTonE2_affine_on_curve((POINTonE2_affine *)p) | E2_is_infty(p); } // checks p1 == p2 -bool E2_is_equal(const E2* p1, const E2* p2) { - // `POINTonE2_is_equal` includes the infinity case - return POINTonE2_is_equal((const POINTonE2*)p1, (const POINTonE2*)p2); +bool E2_is_equal(const E2 *p1, const E2 *p2) { + // `POINTonE2_is_equal` includes the infinity case + return POINTonE2_is_equal((const POINTonE2 *)p1, (const POINTonE2 *)p2); } // res = p -void E2_copy(E2* res, const E2* p) { - if ((uptr_t)p==(uptr_t)res) { - return; - } - vec_copy(res, p, sizeof(E2)); +void E2_copy(E2 *res, const E2 *p) { + if ((uptr_t)p == (uptr_t)res) { + return; + } + vec_copy(res, p, sizeof(E2)); } // converts an E2 point from Jacobian into affine coordinates (z=1) -void E2_to_affine(E2* res, const E2* p) { - // optimize in case coordinates are already affine - if (vec_is_equal(p->z, BLS12_381_Rx.p2, sizeof(p->z))) { - E2_copy(res, p); - return; - } - // convert from Jacobian - POINTonE2_from_Jacobian((POINTonE2*)res, (const POINTonE2*)p); +void E2_to_affine(E2 *res, const E2 *p) { + // optimize in case coordinates are already affine + if (vec_is_equal(p->z, BLS12_381_Rx.p2, sizeof(p->z))) { + E2_copy(res, p); + return; + } + // convert from Jacobian + POINTonE2_from_Jacobian((POINTonE2 *)res, (const POINTonE2 *)p); } // generic point addition that must handle doubling and points at infinity -void E2_add(E2* res, const E2* a, const E2* b) { - POINTonE2_dadd((POINTonE2*)res, (POINTonE2*)a, (POINTonE2*)b, NULL); +void E2_add(E2 *res, const E2 *a, const E2 *b) { + POINTonE2_dadd((POINTonE2 *)res, (POINTonE2 *)a, (POINTonE2 *)b, NULL); } // generic point double that must handle point at infinity -void E2_double(E2* res, const E2* a) { - POINTonE2_double((POINTonE2*)res, (POINTonE2*)a); +void E2_double(E2 *res, const E2 *a) { + POINTonE2_double((POINTonE2 *)res, (POINTonE2 *)a); } // Point negation: res = -a -void E2_neg(E2* res, const E2* a) { - E2_copy(res, a); - POINTonE2_cneg((POINTonE2*)res, 1); +void E2_neg(E2 *res, const E2 *a) { + E2_copy(res, a); + POINTonE2_cneg((POINTonE2 *)res, 1); } // Exponentiation of a generic point `a` in E2, res = expo.a -void E2_mult(E2* res, const E2* p, const Fr* expo) { - pow256 tmp; - pow256_from_Fr(tmp, expo); - POINTonE2_mult_gls((POINTonE2*)res, (POINTonE2*)p, tmp); - vec_zero(&tmp, sizeof(tmp)); +void E2_mult(E2 *res, const E2 *p, const Fr *expo) { + pow256 tmp; + pow256_from_Fr(tmp, expo); + POINTonE2_mult_gls((POINTonE2 *)res, (POINTonE2 *)p, tmp); + vec_zero(&tmp, sizeof(tmp)); } // Exponentiation of a generic point `a` in E2 by a byte exponent, // using a classic double-and-add algorithm (non constant-time) -void E2_mult_small_expo(E2* res, const E2* p, const byte expo) { - // return early if expo is zero - if (expo == 0) { - E2_set_infty(res); - return; - } - // expo is non zero - - byte mask = 1<<7; - // process the most significant zero bits - while((expo & mask) == 0) { - mask >>= 1; - } - - // process the first `1` bit - E2 tmp; - E2_copy(&tmp, p); - mask >>= 1; - // scan the remaining bits - for ( ; mask != 0 ; mask >>= 1 ) { - E2_double(&tmp, &tmp); - if (expo & mask) { - E2_add(&tmp, &tmp, p); - } +void E2_mult_small_expo(E2 *res, const E2 *p, const byte expo) { + // return early if expo is zero + if (expo == 0) { + E2_set_infty(res); + return; + } + // expo is non zero + + byte mask = 1 << 7; + // process the most significant zero bits + while ((expo & mask) == 0) { + mask >>= 1; + } + + // process the first `1` bit + E2 tmp; + E2_copy(&tmp, p); + 
mask >>= 1; + // scan the remaining bits + for (; mask != 0; mask >>= 1) { + E2_double(&tmp, &tmp); + if (expo & mask) { + E2_add(&tmp, &tmp, p); } - E2_copy(res, &tmp); + } + E2_copy(res, &tmp); } // Exponentiation of generator g2 of G2, res = expo.g2 -void G2_mult_gen(E2* res, const Fr* expo) { - pow256 tmp; - pow256_from_Fr(tmp, expo); - POINTonE2_mult_gls((POINTonE2*)res, (POINTonE2*)BLS12_381_g2, tmp); - vec_zero(&tmp, sizeof(tmp)); +void G2_mult_gen(E2 *res, const Fr *expo) { + pow256 tmp; + pow256_from_Fr(tmp, expo); + POINTonE2_mult_gls((POINTonE2 *)res, (POINTonE2 *)BLS12_381_g2, tmp); + vec_zero(&tmp, sizeof(tmp)); } // checks if input E2 point is on the subgroup G2. // It assumes input `p` is on E2. -bool E2_in_G2(const E2* p){ - // currently uses Scott method - return POINTonE2_in_G2((const POINTonE2*)p); +bool E2_in_G2(const E2 *p) { + // currently uses Scott method + return POINTonE2_in_G2((const POINTonE2 *)p); } // computes the sum of the E2 array elements `y[i]` and writes it in `sum` -void E2_sum_vector(E2* sum, const E2* y, const int len){ - E2_set_infty(sum); - for (int i=0; i= Fr_BYTES); - // map to Fr - Fr log; - map_bytes_to_Fr(&log, bytes, len); - // multiplies G2 generator by a random scalar - G2_mult_gen(p, &log); +void unsafe_map_bytes_to_G2(E2 *p, const byte *bytes, int len) { + assert(len >= Fr_BYTES); + // map to Fr + Fr log; + map_bytes_to_Fr(&log, bytes, len); + // multiplies G2 generator by a random scalar + G2_mult_gen(p, &log); } // maps `bytes` to a point in E2\G2 and stores it in p. -// `len` should be at least 192. +// `len` should be at least 192. // this is a testing tool only, it should not be used in any protocol! -void unsafe_map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { - assert(len >= 192); - Fp2 u; - map_96_bytes_to_Fp(&real(&u), bytes, 96); - map_96_bytes_to_Fp(&imag(&u), bytes+96, 96); - // map to E2's isogenous and then to E2 - map_to_isogenous_E2((POINTonE2 *)p, u); - isogeny_map_to_E2((POINTonE2 *)p, (POINTonE2 *)p); - // clear G2 order - E2_mult(p, p, (Fr*)&BLS12_381_r); +void unsafe_map_bytes_to_G2complement(E2 *p, const byte *bytes, int len) { + assert(len >= 192); + Fp2 u; + map_96_bytes_to_Fp(&real(&u), bytes, 96); + map_96_bytes_to_Fp(&imag(&u), bytes + 96, 96); + // map to E2's isogenous and then to E2 + map_to_isogenous_E2((POINTonE2 *)p, u); + isogeny_map_to_E2((POINTonE2 *)p, (POINTonE2 *)p); + // clear G2 order + E2_mult(p, p, (Fr *)&BLS12_381_r); } -// ------------------- Pairing utilities +// ------------------- Pairing utilities bool Fp12_is_one(Fp12 *a) { - return vec_is_equal(a, BLS12_381_Rx.p12, sizeof(Fp12)); + return vec_is_equal(a, BLS12_381_Rx.p12, sizeof(Fp12)); } -void Fp12_set_one(Fp12 *a) { - vec_copy(a, BLS12_381_Rx.p12, sizeof(Fp12)); -} +void Fp12_set_one(Fp12 *a) { vec_copy(a, BLS12_381_Rx.p12, sizeof(Fp12)); } -// computes e(p[0], q[0]) * ... * e(q[len-1], q[len-1]) +// computes e(p[0], q[0]) * ... * e(q[len-1], q[len-1]) // by optimizing a common final exponentiation for all pairings. // result is stored in `res`. -// It assumes `p` and `q` are correctly initialized and all +// It assumes `p` and `q` are correctly initialized and all // p[i] and q[i] are respectively on G1 and G2 (it does not // check their memberships). -void Fp12_multi_pairing(Fp12* res, const E1 *p, const E2 *q, const int len) { - // easier access pointer - vec384fp6* res_vec = (vec384fp6*)res; - // N_MAX is defined within BLST. 
It should represent a good tradeoff of the max number - // of miller loops to be batched in one call to `miller_loop_n`. - // miller_loop_n expects an array of `POINTonEx_affine`. - POINTonE1_affine p_aff[N_MAX]; - POINTonE2_affine q_aff[N_MAX]; - int n = 0; // the number of couples (p,q) held in p_aff and q_aff - int init_flag = 0; - - for (int i=0; i 0) { - if (!init_flag) { - miller_loop_n(res_vec, q_aff, p_aff, n); - init_flag = 1; - } else { - vec384fp12 tmp; - miller_loop_n(tmp, q_aff, p_aff, n); - mul_fp12(res_vec, res_vec, tmp); - } - } - - // check if no miller loop was computed + } + // if p_ and q_ aren't empty, + // remaining couples are also batched in `n` miller loops + if (n > 0) { if (!init_flag) { - Fp12_set_one(res); + miller_loop_n(res_vec, q_aff, p_aff, n); + init_flag = 1; + } else { + vec384fp12 tmp; + miller_loop_n(tmp, q_aff, p_aff, n); + mul_fp12(res_vec, res_vec, tmp); } - final_exp(res_vec, res_vec); + } + + // check if no miller loop was computed + if (!init_flag) { + Fp12_set_one(res); + } + final_exp(res_vec, res_vec); } // This is a testing function and is not used in exported functions // It uses an expand message XMD based on SHA2-256. -void xmd_sha256(byte *hash, int len_hash, byte *msg, int len_msg, byte *dst, int len_dst){ - expand_message_xmd(hash, len_hash, NULL, 0, msg, len_msg, dst, len_dst); +void xmd_sha256(byte *hash, int len_hash, byte *msg, int len_msg, byte *dst, + int len_dst) { + expand_message_xmd(hash, len_hash, NULL, 0, msg, len_msg, dst, len_dst); } - -// DEBUG printing functions +// DEBUG printing functions #if (DEBUG == 1) -void bytes_print_(char* s, byte* data, int len) { - if (strlen(s)) printf("[%s]:\n", s); - for (int i=0; i -#include #include "blst_include.h" +#include +#include typedef uint8_t byte; -typedef _Bool bool; // assuming cgo is using a modern enough compiler +typedef _Bool bool; // assuming cgo is using a modern enough compiler // minimum targeted security level -#define SEC_BITS 128 +#define SEC_BITS 128 typedef enum { - VALID = 0, - INVALID, - BAD_ENCODING, - BAD_VALUE, - POINT_NOT_ON_CURVE, - POINT_NOT_IN_GROUP, - UNDEFINED, + VALID = 0, + INVALID, + BAD_ENCODING, + BAD_VALUE, + POINT_NOT_ON_CURVE, + POINT_NOT_IN_GROUP, + UNDEFINED, } ERROR; -#define BITS_TO_BYTES(x) ((x+7)>>3) -#define BITS_TO_LIMBS(x) ((x+63)>>6) -#define BYTES_TO_LIMBS(x) ((x+7)>>3) -#define LIMBS_TO_BYTES(x) ((x)<<3) -#define MIN(a,b) ((a)>(b)?(b):(a)) +#define BITS_TO_BYTES(x) ((x + 7) >> 3) +#define BITS_TO_LIMBS(x) ((x + 63) >> 6) +#define BYTES_TO_LIMBS(x) ((x + 7) >> 3) +#define LIMBS_TO_BYTES(x) ((x) << 3) +#define MIN(a, b) ((a) > (b) ? 
(b) : (a)) // Fields and Group serialization lengths -#define Fp_BITS 381 -#define Fp2_BYTES (2*Fp_BYTES) -#define Fp_LIMBS BITS_TO_LIMBS(Fp_BITS) -#define Fp_BYTES LIMBS_TO_BYTES(Fp_LIMBS) // BLST implements Fp as a limb array -#define Fr_BITS 255 -#define Fr_LIMBS BITS_TO_LIMBS(Fr_BITS) -#define Fr_BYTES LIMBS_TO_BYTES(Fr_LIMBS) // BLST implements Fr as a limb array +#define Fp_BITS 381 +#define Fp2_BYTES (2 * Fp_BYTES) +#define Fp_LIMBS BITS_TO_LIMBS(Fp_BITS) +#define Fp_BYTES LIMBS_TO_BYTES(Fp_LIMBS) // BLST implements Fp as a limb array +#define Fr_BITS 255 +#define Fr_LIMBS BITS_TO_LIMBS(Fr_BITS) +#define Fr_BYTES LIMBS_TO_BYTES(Fr_LIMBS) // BLST implements Fr as a limb array -#define G1_BYTES (2*Fp_BYTES) -#define G2_BYTES (2*Fp2_BYTES) +#define G1_BYTES (2 * Fp_BYTES) +#define G2_BYTES (2 * Fp2_BYTES) // Compressed and uncompressed points -#define COMPRESSED 1 -#define UNCOMPRESSED 0 -#define G1_SERIALIZATION (COMPRESSED) -#define G2_SERIALIZATION (COMPRESSED) -#define G1_SER_BYTES (G1_BYTES/(G1_SERIALIZATION+1)) -#define G2_SER_BYTES (G2_BYTES/(G2_SERIALIZATION+1)) +#define COMPRESSED 1 +#define UNCOMPRESSED 0 +#define G1_SERIALIZATION (COMPRESSED) +#define G2_SERIALIZATION (COMPRESSED) +#define G1_SER_BYTES (G1_BYTES / (G1_SERIALIZATION + 1)) +#define G2_SER_BYTES (G2_BYTES / (G2_SERIALIZATION + 1)) // Fr utilities extern const Fr BLS12_381_rR; -bool Fr_is_zero(const Fr* a); -bool Fr_is_equal(const Fr* a, const Fr* b); -void Fr_set_limb(Fr*, const limb_t); -void Fr_copy(Fr*, const Fr*); -void Fr_set_zero(Fr*); -void Fr_add(Fr *res, const Fr *a, const Fr *b); -void Fr_sub(Fr *res, const Fr *a, const Fr *b); -void Fr_neg(Fr *res, const Fr *a); -void Fr_sum_vector(Fr*, const Fr x[], const int); -void Fr_mul_montg(Fr *res, const Fr *a, const Fr *b); -void Fr_squ_montg(Fr *res, const Fr *a); -void Fr_to_montg(Fr *res, const Fr *a); -void Fr_from_montg(Fr *res, const Fr *a); -void Fr_exp_montg(Fr *res, const Fr* base, const limb_t* expo, const int expo_len); -void Fr_inv_montg_eucl(Fr *res, const Fr *a); -void Fr_inv_exp_montg(Fr *res, const Fr *a); -ERROR Fr_read_bytes(Fr* a, const byte *bin, int len); -ERROR Fr_star_read_bytes(Fr* a, const byte *bin, int len); -void Fr_write_bytes(byte *bin, const Fr* a); -bool map_bytes_to_Fr(Fr*, const byte*, int); +bool Fr_is_zero(const Fr *a); +bool Fr_is_equal(const Fr *a, const Fr *b); +void Fr_set_limb(Fr *, const limb_t); +void Fr_copy(Fr *, const Fr *); +void Fr_set_zero(Fr *); +void Fr_add(Fr *res, const Fr *a, const Fr *b); +void Fr_sub(Fr *res, const Fr *a, const Fr *b); +void Fr_neg(Fr *res, const Fr *a); +void Fr_sum_vector(Fr *, const Fr x[], const int); +void Fr_mul_montg(Fr *res, const Fr *a, const Fr *b); +void Fr_squ_montg(Fr *res, const Fr *a); +void Fr_to_montg(Fr *res, const Fr *a); +void Fr_from_montg(Fr *res, const Fr *a); +void Fr_exp_montg(Fr *res, const Fr *base, const limb_t *expo, + const int expo_len); +void Fr_inv_montg_eucl(Fr *res, const Fr *a); +void Fr_inv_exp_montg(Fr *res, const Fr *a); +ERROR Fr_read_bytes(Fr *a, const byte *bin, int len); +ERROR Fr_star_read_bytes(Fr *a, const byte *bin, int len); +void Fr_write_bytes(byte *bin, const Fr *a); +bool map_bytes_to_Fr(Fr *, const byte *, int); // Fp utilities -void Fp_mul_montg(Fp *, const Fp *, const Fp *); -void Fp_squ_montg(Fp *, const Fp *); +void Fp_mul_montg(Fp *, const Fp *, const Fp *); +void Fp_squ_montg(Fp *, const Fp *); // E1 and G1 utilities -void E1_copy(E1*, const E1*); -bool E1_is_equal(const E1*, const E1*); -void E1_set_infty(E1*); -bool 
E1_is_infty(const E1*); -void E1_to_affine(E1*, const E1*); -bool E1_affine_on_curve(const E1*); -bool E1_in_G1(const E1*); -void E1_mult(E1*, const E1*, const Fr*); -void E1_add(E1*, const E1*, const E1*); -void E1_neg(E1*, const E1*); -void E1_sum_vector(E1*, const E1*, const int); -int E1_sum_vector_byte(byte*, const byte*, const int); -void G1_mult_gen(E1*, const Fr*); -ERROR E1_read_bytes(E1*, const byte *, const int); -void E1_write_bytes(byte *, const E1*); -void unsafe_map_bytes_to_G1(E1*, const byte*, int); -void unsafe_map_bytes_to_G1complement(E1*, const byte*, int); - -#define MAP_TO_G1_INPUT_LEN (2*(Fp_BYTES + SEC_BITS/8)) -int map_to_G1(E1*, const byte*, const int); // functions in bls12381_hashtocurve.c +void E1_copy(E1 *, const E1 *); +bool E1_is_equal(const E1 *, const E1 *); +void E1_set_infty(E1 *); +bool E1_is_infty(const E1 *); +void E1_to_affine(E1 *, const E1 *); +bool E1_affine_on_curve(const E1 *); +bool E1_in_G1(const E1 *); +void E1_mult(E1 *, const E1 *, const Fr *); +void E1_add(E1 *, const E1 *, const E1 *); +void E1_neg(E1 *, const E1 *); +void E1_sum_vector(E1 *, const E1 *, const int); +int E1_sum_vector_byte(byte *, const byte *, const int); +void G1_mult_gen(E1 *, const Fr *); +ERROR E1_read_bytes(E1 *, const byte *, const int); +void E1_write_bytes(byte *, const E1 *); +void unsafe_map_bytes_to_G1(E1 *, const byte *, int); +void unsafe_map_bytes_to_G1complement(E1 *, const byte *, int); + +#define MAP_TO_G1_INPUT_LEN (2 * (Fp_BYTES + SEC_BITS / 8)) +int map_to_G1(E1 *, const byte *, + const int); // functions in bls12381_hashtocurve.c // E2 and G2 utilities -void E2_set_infty(E2* p); -bool E2_is_infty(const E2*); -bool E2_affine_on_curve(const E2*); -bool E2_is_equal(const E2*, const E2*); -void E2_copy(E2*, const E2*); -void E2_to_affine(E2*, const E2*); -ERROR E2_read_bytes(E2*, const byte *, const int); -void E2_write_bytes(byte *, const E2*); -void G2_mult_gen(E2*, const Fr*); -void E2_mult(E2*, const E2*, const Fr*); -void E2_mult_small_expo(E2*, const E2*, const byte); -void E2_add(E2* res, const E2* a, const E2* b); -void E2_double(E2* res, const E2* a); -void E2_neg(E2*, const E2*); -void E2_sum_vector(E2*, const E2*, const int); -void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); -bool E2_in_G2(const E2*); -void unsafe_map_bytes_to_G2(E2*, const byte*, int); -void unsafe_map_bytes_to_G2complement(E2*, const byte*, int); +void E2_set_infty(E2 *p); +bool E2_is_infty(const E2 *); +bool E2_affine_on_curve(const E2 *); +bool E2_is_equal(const E2 *, const E2 *); +void E2_copy(E2 *, const E2 *); +void E2_to_affine(E2 *, const E2 *); +ERROR E2_read_bytes(E2 *, const byte *, const int); +void E2_write_bytes(byte *, const E2 *); +void G2_mult_gen(E2 *, const Fr *); +void E2_mult(E2 *, const E2 *, const Fr *); +void E2_mult_small_expo(E2 *, const E2 *, const byte); +void E2_add(E2 *res, const E2 *a, const E2 *b); +void E2_double(E2 *res, const E2 *a); +void E2_neg(E2 *, const E2 *); +void E2_sum_vector(E2 *, const E2 *, const int); +void E2_subtract_vector(E2 *res, const E2 *x, const E2 *y, const int len); +bool E2_in_G2(const E2 *); +void unsafe_map_bytes_to_G2(E2 *, const byte *, int); +void unsafe_map_bytes_to_G2complement(E2 *, const byte *, int); // pairing and Fp12 -bool Fp12_is_one(Fp12*); -void Fp12_set_one(Fp12*); -void Fp12_multi_pairing(Fp12*, const E1*, const E2*, const int); +bool Fp12_is_one(Fp12 *); +void Fp12_set_one(Fp12 *); +void Fp12_multi_pairing(Fp12 *, const E1 *, const E2 *, const int); // utility testing 
function void xmd_sha256(byte *, int, byte *, int, byte *, int); @@ -133,13 +135,13 @@ void xmd_sha256(byte *, int, byte *, int, byte *, int); #define DEBUG 1 #if (DEBUG == 1) #include -void bytes_print_(char*, byte*, int); -void Fr_print_(char*, Fr*); -void Fp_print_(char*, const Fp*); -void Fp2_print_(char*, const Fp2*); -void Fp12_print_(char*, const Fp12*); -void E1_print_(char*, const E1*, const int); -void E2_print_(char*, const E2*, const int); +void bytes_print_(char *, byte *, int); +void Fr_print_(char *, Fr *); +void Fp_print_(char *, const Fp *); +void Fp2_print_(char *, const Fp2 *); +void Fp12_print_(char *, const Fp12 *); +void E1_print_(char *, const E1 *, const int); +void E2_print_(char *, const E2 *, const int); #endif /* DEBUG */ #endif /* BLS12_381_UTILS */ \ No newline at end of file diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 39b9e243fd1..942002de747 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -6,461 +6,498 @@ // Computes a BLS signature from a G1 point and writes it in `out`. // `out` must be allocated properly with `G1_SER_BYTES` bytes. -static void bls_sign_E1(byte* out, const Fr* sk, const E1* h) { - // s = h^s - E1 s; - E1_mult(&s, h, sk); - E1_write_bytes(out, &s); +static void bls_sign_E1(byte *out, const Fr *sk, const E1 *h) { + // s = h^s + E1 s; + E1_mult(&s, h, sk); + E1_write_bytes(out, &s); } // Computes a BLS signature from a hash and writes it in `out`. -// `hash` represents the hashed message with length `hash_len` equal to `MAP_TO_G1_INPUT_LEN`. -// `out` must be allocated properly with `G1_SER_BYTES` bytes. -int bls_sign(byte* out, const Fr* sk, const byte* hash, const int hash_len) { - // hash to G1 - E1 h; - if (map_to_G1(&h, hash, hash_len) != VALID) { - return INVALID; - } - // s = h^sk - bls_sign_E1(out, sk, &h); - return VALID; +// `hash` represents the hashed message with length `hash_len` equal to +// `MAP_TO_G1_INPUT_LEN`. `out` must be allocated properly with `G1_SER_BYTES` +// bytes. +int bls_sign(byte *out, const Fr *sk, const byte *hash, const int hash_len) { + // hash to G1 + E1 h; + if (map_to_G1(&h, hash, hash_len) != VALID) { + return INVALID; + } + // s = h^sk + bls_sign_E1(out, sk, &h); + return VALID; } -extern const E2* BLS12_381_minus_g2; +extern const E2 *BLS12_381_minus_g2; // Verifies a BLS signature (G1 point) against a public key (G2 point) // and a message hash `h` (G1 point). -// Hash, signature and public key are assumed to be in G1, G1 and G2 respectively. This -// function only checks the pairing equality. -static int bls_verify_E1(const E2* pk, const E1* s, const E1* h) { - E1 elemsG1[2]; - E2 elemsG2[2]; - - // elemsG1[0] = s, elemsG1[1] = h - E1_copy(&elemsG1[0], s); - E1_copy(&elemsG1[1], h); - - // elemsG2[0] = -g2, elemsG2[1] = pk - E2_copy(&elemsG2[0], BLS12_381_minus_g2); - E2_copy(&elemsG2[1], pk); - - // double pairing - Fp12 e; - Fp12_multi_pairing(&e, elemsG1, elemsG2, 2); - if (Fp12_is_one(&e)) { - return VALID; - } - return INVALID; +// Hash, signature and public key are assumed to be in G1, G1 and G2 +// respectively. This function only checks the pairing equality. 
+static int bls_verify_E1(const E2 *pk, const E1 *s, const E1 *h) { + E1 elemsG1[2]; + E2 elemsG2[2]; + + // elemsG1[0] = s, elemsG1[1] = h + E1_copy(&elemsG1[0], s); + E1_copy(&elemsG1[1], h); + + // elemsG2[0] = -g2, elemsG2[1] = pk + E2_copy(&elemsG2[0], BLS12_381_minus_g2); + E2_copy(&elemsG2[1], pk); + + // double pairing + Fp12 e; + Fp12_multi_pairing(&e, elemsG1, elemsG2, 2); + if (Fp12_is_one(&e)) { + return VALID; + } + return INVALID; } - // Verifies the validity of an aggregated BLS signature under distinct messages. // -// Each message is mapped to a set of public keys, so that the verification equation is -// optimized to compute one pairing per message. +// Each message is mapped to a set of public keys, so that the verification +// equation is optimized to compute one pairing per message. // - sig is the signature. // - nb_hashes is the number of the messages (hashes) in the map -// - hashes is pointer to all flattened hashes in order where the hash at index i has a byte length len_hashes[i], -// is mapped to pks_per_hash[i] public keys. +// - hashes is pointer to all flattened hashes in order where the hash at index +// i has a byte length len_hashes[i], +// is mapped to pks_per_hash[i] public keys. // - the keys are flattened in pks in the same hashes order. // // membership check of the signature in G1 is verified in this function // membership check of pks in G2 is not verified in this function -// the membership check is separated to allow optimizing multiple verifications using the same pks -int bls_verifyPerDistinctMessage(const byte* sig, - const int nb_hashes, const byte* hashes, const uint32_t* len_hashes, - const uint32_t* pks_per_hash, const E2* pks) { - - int ret = UNDEFINED; // return value - - E1* elemsG1 = (E1*)malloc((nb_hashes + 1) * sizeof(E1)); - if (!elemsG1) goto outG1; - E2* elemsG2 = (E2*)malloc((nb_hashes + 1) * sizeof(E2)); - if (!elemsG2) goto outG2; - - // elemsG1[0] = sig - if (E1_read_bytes(&elemsG1[0], sig, G1_SER_BYTES) != VALID) { - ret = INVALID; - goto out; - } - - // check signature is in G1 - if (!E1_in_G1(&elemsG1[0])) { - ret = INVALID; - goto out; - } - - // elemsG2[0] = -g2 - E2_copy(&elemsG2[0], BLS12_381_minus_g2); - - // map all hashes to G1 - int offset = 0; - for (int i=1; i < nb_hashes+1; i++) { - // elemsG1[i] = h - // hash to G1 - map_to_G1(&elemsG1[i], &hashes[offset], len_hashes[i-1]); - offset += len_hashes[i-1]; - } - - // aggregate public keys mapping to the same hash - offset = 0; - for (int i=1; i < nb_hashes+1; i++) { - // elemsG2[i] = agg_pk[i] - E2_sum_vector(&elemsG2[i], &pks[offset] , pks_per_hash[i-1]); - offset += pks_per_hash[i-1]; - } - - // multi pairing - Fp12 e; - Fp12_multi_pairing(&e, elemsG1 , elemsG2, nb_hashes+1); - if (Fp12_is_one(&e)) { - ret = VALID; - } else { - ret = INVALID; - } +// the membership check is separated to allow optimizing multiple verifications +// using the same pks +int bls_verifyPerDistinctMessage(const byte *sig, const int nb_hashes, + const byte *hashes, const uint32_t *len_hashes, + const uint32_t *pks_per_hash, const E2 *pks) { + + int ret = UNDEFINED; // return value + + E1 *elemsG1 = (E1 *)malloc((nb_hashes + 1) * sizeof(E1)); + if (!elemsG1) + goto outG1; + E2 *elemsG2 = (E2 *)malloc((nb_hashes + 1) * sizeof(E2)); + if (!elemsG2) + goto outG2; + + // elemsG1[0] = sig + if (E1_read_bytes(&elemsG1[0], sig, G1_SER_BYTES) != VALID) { + ret = INVALID; + goto out; + } + + // check signature is in G1 + if (!E1_in_G1(&elemsG1[0])) { + ret = INVALID; + goto out; + } + + // 
elemsG2[0] = -g2 + E2_copy(&elemsG2[0], BLS12_381_minus_g2); + + // map all hashes to G1 + int offset = 0; + for (int i = 1; i < nb_hashes + 1; i++) { + // elemsG1[i] = h + // hash to G1 + map_to_G1(&elemsG1[i], &hashes[offset], len_hashes[i - 1]); + offset += len_hashes[i - 1]; + } + + // aggregate public keys mapping to the same hash + offset = 0; + for (int i = 1; i < nb_hashes + 1; i++) { + // elemsG2[i] = agg_pk[i] + E2_sum_vector(&elemsG2[i], &pks[offset], pks_per_hash[i - 1]); + offset += pks_per_hash[i - 1]; + } + + // multi pairing + Fp12 e; + Fp12_multi_pairing(&e, elemsG1, elemsG2, nb_hashes + 1); + if (Fp12_is_one(&e)) { + ret = VALID; + } else { + ret = INVALID; + } out: - free(elemsG2); + free(elemsG2); outG2: - free(elemsG1); + free(elemsG1); outG1: - return ret; + return ret; } - -// Verifies the validity of an aggregated BLS signature under distinct public keys. +// Verifies the validity of an aggregated BLS signature under distinct public +// keys. // -// Each key is mapped to a set of messages, so that the verification equation is -// optimized to compute one pairing per public key. +// Each key is mapped to a set of messages, so that the verification equation is +// optimized to compute one pairing per public key. // - nb_pks is the number of the public keys in the map. // - pks is pointer to all pks in order where the key at index i -// is mapped to hashes_per_pk[i] hashes. +// is mapped to hashes_per_pk[i] hashes. // - the messages (hashes) are flattened in hashes in the same public key order, // each with a length in len_hashes. // // membership check of the signature in G1 is verified in this function // membership check of pks in G2 is not verified in this function -// the membership check is separated to allow optimizing multiple verifications using the same pks -int bls_verifyPerDistinctKey(const byte* sig, - const int nb_pks, const E2* pks, const uint32_t* hashes_per_pk, - const byte* hashes, const uint32_t* len_hashes){ - - int ret = UNDEFINED; // return value - - E1* elemsG1 = (E1*)malloc((nb_pks + 1) * sizeof(E1)); - if (!elemsG1) goto outG1; - E2* elemsG2 = (E2*)malloc((nb_pks + 1) * sizeof(E2)); - if (!elemsG2) goto outG2; - - // elemsG1[0] = s - if (E1_read_bytes(&elemsG1[0], sig, G1_SER_BYTES) != VALID) { - ret = INVALID; - goto out; +// the membership check is separated to allow optimizing multiple verifications +// using the same pks +int bls_verifyPerDistinctKey(const byte *sig, const int nb_pks, const E2 *pks, + const uint32_t *hashes_per_pk, const byte *hashes, + const uint32_t *len_hashes) { + + int ret = UNDEFINED; // return value + + E1 *elemsG1 = (E1 *)malloc((nb_pks + 1) * sizeof(E1)); + if (!elemsG1) + goto outG1; + E2 *elemsG2 = (E2 *)malloc((nb_pks + 1) * sizeof(E2)); + if (!elemsG2) + goto outG2; + + // elemsG1[0] = s + if (E1_read_bytes(&elemsG1[0], sig, G1_SER_BYTES) != VALID) { + ret = INVALID; + goto out; + } + + // check s in G1 + if (!E1_in_G1(&elemsG1[0])) { + ret = INVALID; + goto out; + } + + // elemsG2[0] = -g2 + E2_copy(&elemsG2[0], BLS12_381_minus_g2); + + // set the public keys + for (int i = 1; i < nb_pks + 1; i++) { + E2_copy(&elemsG2[i], &pks[i - 1]); + } + + // map all hashes to G1 and aggregate the ones with the same public key + + // tmp_hashes is a temporary array of all hashes under a same key mapped to a + // G1 point. tmp_hashes size is set to the maximum possible size to minimize + // malloc calls. 
+ int tmp_hashes_size = hashes_per_pk[0]; + for (int i = 1; i < nb_pks; i++) { + if (hashes_per_pk[i] > tmp_hashes_size) { + tmp_hashes_size = hashes_per_pk[i]; } - - // check s in G1 - if (!E1_in_G1(&elemsG1[0])){ - ret = INVALID; - goto out; + } + E1 *tmp_hashes = (E1 *)malloc(tmp_hashes_size * sizeof(E1)); + if (!tmp_hashes) { + ret = UNDEFINED; + goto out; + } + + // sum hashes under the same key + int data_offset = 0; + int index_offset = 0; + for (int i = 1; i < nb_pks + 1; i++) { + for (int j = 0; j < hashes_per_pk[i - 1]; j++) { + // map the hash to G1 + map_to_G1(&tmp_hashes[j], &hashes[data_offset], len_hashes[index_offset]); + data_offset += len_hashes[index_offset]; + index_offset++; } + // aggregate all the points of the array + E1_sum_vector(&elemsG1[i], tmp_hashes, hashes_per_pk[i - 1]); + } + free(tmp_hashes); - // elemsG2[0] = -g2 - E2_copy(&elemsG2[0], BLS12_381_minus_g2); + // multi pairing + Fp12 e; + Fp12_multi_pairing(&e, elemsG1, elemsG2, nb_pks + 1); - // set the public keys - for (int i=1; i < nb_pks+1; i++) { - E2_copy(&elemsG2[i], &pks[i-1]); - } - - // map all hashes to G1 and aggregate the ones with the same public key - - // tmp_hashes is a temporary array of all hashes under a same key mapped to a G1 point. - // tmp_hashes size is set to the maximum possible size to minimize malloc calls. - int tmp_hashes_size = hashes_per_pk[0]; - for (int i=1; i tmp_hashes_size) { - tmp_hashes_size = hashes_per_pk[i]; - } - } - E1* tmp_hashes = (E1*)malloc(tmp_hashes_size * sizeof(E1)); - if (!tmp_hashes) { - ret = UNDEFINED; - goto out; - } - - // sum hashes under the same key - int data_offset = 0; - int index_offset = 0; - for (int i=1; i < nb_pks+1; i++) { - for (int j=0; j < hashes_per_pk[i-1]; j++) { - // map the hash to G1 - map_to_G1(&tmp_hashes[j], &hashes[data_offset], len_hashes[index_offset]); - data_offset += len_hashes[index_offset]; - index_offset++; - } - // aggregate all the points of the array - E1_sum_vector(&elemsG1[i], tmp_hashes, hashes_per_pk[i-1]); - } - free(tmp_hashes); - - // multi pairing - Fp12 e; - Fp12_multi_pairing(&e, elemsG1, elemsG2, nb_pks+1); - - if (Fp12_is_one(&e)) { - ret = VALID; - } else { - ret = INVALID; - } + if (Fp12_is_one(&e)) { + ret = VALID; + } else { + ret = INVALID; + } out: - free(elemsG2); + free(elemsG2); outG2: - free(elemsG1); + free(elemsG1); outG1: - return ret; + return ret; } // Verifies a BLS signature in a byte buffer. // membership check of the signature in G1 is verified. // membership check of pk in G2 is not verified in this function. -// the membership check in G2 is separated to optimize multiple verifications using the same key. -// `hash` represents the hashed message with length `hash_len` equal to `MAP_TO_G1_INPUT_LEN`. -int bls_verify(const E2* pk, const byte* sig, const byte* hash, const int hash_len) { - E1 s, h; - // deserialize the signature into a curve point - if (E1_read_bytes(&s, sig, G1_SER_BYTES) != VALID) { - return INVALID; - } +// the membership check in G2 is separated to optimize multiple verifications +// using the same key. `hash` represents the hashed message with length +// `hash_len` equal to `MAP_TO_G1_INPUT_LEN`. 
+int bls_verify(const E2 *pk, const byte *sig, const byte *hash, + const int hash_len) { + E1 s, h; + // deserialize the signature into a curve point + if (E1_read_bytes(&s, sig, G1_SER_BYTES) != VALID) { + return INVALID; + } - // check s is in G1 - if (!E1_in_G1(&s)) { - return INVALID; - } + // check s is in G1 + if (!E1_in_G1(&s)) { + return INVALID; + } - if (map_to_G1(&h, hash, hash_len) != VALID) { - return INVALID; - } - - return bls_verify_E1(pk, &s, &h); -} + if (map_to_G1(&h, hash, hash_len) != VALID) { + return INVALID; + } + return bls_verify_E1(pk, &s, &h); +} // binary tree structure to be used by bls_batch verify. -// Each node contains a signature and a public key, the signature (resp. the public key) -// being the aggregated signature of the two children's signature (resp. public keys). -// The leaves contain the initial signatures and public keys. -typedef struct st_node { - E1* sig; - E2* pk; - struct st_node* left; - struct st_node* right; +// Each node contains a signature and a public key, the signature (resp. the +// public key) being the aggregated signature of the two children's signature +// (resp. public keys). The leaves contain the initial signatures and public +// keys. +typedef struct st_node { + E1 *sig; + E2 *pk; + struct st_node *left; + struct st_node *right; } node; -static node* new_node(const E2* pk, const E1* sig){ - node* t = (node*) malloc(sizeof(node)); - if (t) { - t->pk = (E2*)pk; - t->sig = (E1*)sig; - t->right = t->left = NULL; - } - return t; +static node *new_node(const E2 *pk, const E1 *sig) { + node *t = (node *)malloc(sizeof(node)); + if (t) { + t->pk = (E2 *)pk; + t->sig = (E1 *)sig; + t->right = t->left = NULL; + } + return t; } -static void free_tree(node* root) { - if (!root) return; - - // only free pks and sigs of non-leafs, data of leafs are allocated - // as an entire array in `bls_batch_verify`. - if (root->left) { // no need to check the right child for the leaf check because - // the recursive build starts with the left side first - // pointer free - free(root->sig); - free(root->pk); - // free the children nodes - free_tree(root->left); - free_tree(root->right); - } - free(root); +static void free_tree(node *root) { + if (!root) + return; + + // only free pks and sigs of non-leafs, data of leafs are allocated + // as an entire array in `bls_batch_verify`. + if (root->left) { // no need to check the right child for the leaf check + // because + // the recursive build starts with the left side first + // pointer free + free(root->sig); + free(root->pk); + // free the children nodes + free_tree(root->left); + free_tree(root->right); + } + free(root); } -// builds a binary tree of aggregation of signatures and public keys recursively. 
-static node* build_tree(const int len, const E2* pks, const E1* sigs) { - // check if a leaf is reached - if (len == 1) { - return new_node(&pks[0], &sigs[0]); // use the first element of the arrays - } - - // a leaf is not reached yet, - int right_len = len/2; - int left_len = len - right_len; - - // create a new node with new points - E2* new_pk = (E2*)malloc(sizeof(E2)); - if (!new_pk) {goto error;} - E1* new_sig = (E1*)malloc(sizeof(E1)); - if (!new_sig) {goto error_sig;} - - node* t = new_node(new_pk, new_sig); - if (!t) goto error_node; - - // build the tree in a top-down way - t->left = build_tree(left_len, &pks[0], &sigs[0]); - if (!t->left) { free_tree(t); goto error; } - - t->right = build_tree(right_len, &pks[left_len], &sigs[left_len]); - if (!t->right) { free_tree(t); goto error; } - // sum the children - E1_add(t->sig, t->left->sig, t->right->sig); - E2_add(t->pk, t->left->pk, t->right->pk); - return t; +// builds a binary tree of aggregation of signatures and public keys +// recursively. +static node *build_tree(const int len, const E2 *pks, const E1 *sigs) { + // check if a leaf is reached + if (len == 1) { + return new_node(&pks[0], &sigs[0]); // use the first element of the arrays + } + + // a leaf is not reached yet, + int right_len = len / 2; + int left_len = len - right_len; + + // create a new node with new points + E2 *new_pk = (E2 *)malloc(sizeof(E2)); + if (!new_pk) { + goto error; + } + E1 *new_sig = (E1 *)malloc(sizeof(E1)); + if (!new_sig) { + goto error_sig; + } + + node *t = new_node(new_pk, new_sig); + if (!t) + goto error_node; + + // build the tree in a top-down way + t->left = build_tree(left_len, &pks[0], &sigs[0]); + if (!t->left) { + free_tree(t); + goto error; + } + + t->right = build_tree(right_len, &pks[left_len], &sigs[left_len]); + if (!t->right) { + free_tree(t); + goto error; + } + // sum the children + E1_add(t->sig, t->left->sig, t->right->sig); + E2_add(t->pk, t->left->pk, t->right->pk); + return t; error_node: - free(new_sig); + free(new_sig); error_sig: - free(new_pk); + free(new_pk); error: - return NULL; + return NULL; } -// verify the binary tree and fill the results using recursive batch verifications. -static void bls_batch_verify_tree(const node* root, const int len, byte* results, const E1* h) { - // verify the aggregated signature against the aggregated public key. - int res = bls_verify_E1(root->pk, root->sig, h); - - // if the result is valid, all the subtree signatures are valid. - if (res == VALID) { - for (int i=0; i < len; i++) { - if (results[i] == UNDEFINED) results[i] = VALID; // do not overwrite invalid results - } - return; +// verify the binary tree and fill the results using recursive batch +// verifications. +static void bls_batch_verify_tree(const node *root, const int len, + byte *results, const E1 *h) { + // verify the aggregated signature against the aggregated public key. + int res = bls_verify_E1(root->pk, root->sig, h); + + // if the result is valid, all the subtree signatures are valid. + if (res == VALID) { + for (int i = 0; i < len; i++) { + if (results[i] == UNDEFINED) + results[i] = VALID; // do not overwrite invalid results } - - // check if root is a leaf - if (root->left == NULL) { // no need to check the right side - *results = INVALID; - return; - } - - // otherwise, at least one of the subtree signatures is invalid. - // use the binary tree structure to find the invalid signatures. 
- int right_len = len/2; - int left_len = len - right_len; - bls_batch_verify_tree(root->left, left_len, &results[0], h); - bls_batch_verify_tree(root->right, right_len, &results[left_len], h); + return; + } + + // check if root is a leaf + if (root->left == NULL) { // no need to check the right side + *results = INVALID; + return; + } + + // otherwise, at least one of the subtree signatures is invalid. + // use the binary tree structure to find the invalid signatures. + int right_len = len / 2; + int left_len = len - right_len; + bls_batch_verify_tree(root->left, left_len, &results[0], h); + bls_batch_verify_tree(root->right, right_len, &results[left_len], h); } -// Batch verifies the validity of a multiple BLS signatures of the -// same message under multiple public keys. Each signature at index `i` is verified -// against the public key at index `i`. -// `seed` is used as the entropy source for randoms required by the computation. The function -// assumes the source size is at least (16*sigs_len) of random bytes of entropy at least 128 bits. +// Batch verifies the validity of a multiple BLS signatures of the +// same message under multiple public keys. Each signature at index `i` is +// verified against the public key at index `i`. `seed` is used as the entropy +// source for randoms required by the computation. The function assumes the +// source size is at least (16*sigs_len) of random bytes of entropy at least 128 +// bits. // // - membership checks of all signatures is verified upfront. -// - use random coefficients for signatures and public keys at the same index to prevent +// - use random coefficients for signatures and public keys at the same index to +// prevent // indices mixup. -// - optimize the verification by verifying an aggregated signature against an aggregated -// public key, and use a recursive verification to find invalid signatures. -void bls_batch_verify(const int sigs_len, byte* results, const E2* pks_input, - const byte* sigs_bytes, const byte* data, const int data_len, const byte* seed) { - - // initialize results to undefined - memset(results, UNDEFINED, sigs_len); - - // build the arrays of G1 and G2 elements to verify - E2* pks = (E2*) malloc(sigs_len * sizeof(E2)); - if (!pks) return; - E1* sigs = (E1*) malloc(sigs_len * sizeof(E1)); - if (!sigs) goto out_sigs; - - for (int i=0; i < sigs_len; i++) { - // convert the signature points: - // - invalid points are stored as infinity points with an invalid result, so that - // the tree aggregations remain valid. - // - valid points are multiplied by a random scalar (same for public keys at same index) - // to make sure a signature at index (i) is verified against the public key at the same index. 
- int read_ret = E1_read_bytes(&sigs[i], &sigs_bytes[G1_SER_BYTES*i], G1_SER_BYTES); - if (read_ret != VALID || !E1_in_G1(&sigs[i])) { - // set signature and key to infinity (no effect on the aggregation tree) - // and set result to invalid (result won't be overwritten) - E2_set_infty(&pks[i]); - E1_set_infty(&sigs[i]); - results[i] = INVALID; - } else { - // choose a random non-zero coefficient of at least 128 bits - Fr r, one; - // r = random, i-th seed is used for i-th signature - Fr_set_zero(&r); - const int seed_len = SEC_BITS/8; - limbs_from_be_bytes((limb_t*)&r, seed + (seed_len*i), seed_len); // faster shortcut than Fr_map_bytes - // r = random + 1 - Fr_set_limb(&one, 1); - Fr_add(&r, &r, &one); - // multiply public key and signature by the same random exponent r - E2_mult(&pks[i], &pks_input[i], &r); - E1_mult(&sigs[i], &sigs[i], &r); - } - } - // build a binary tree of aggreagtions - node* root = build_tree(sigs_len, &pks[0], &sigs[0]); - if (!root) goto out; - - E1 h; - if (map_to_G1(&h, data, data_len) != VALID) { - goto out; +// - optimize the verification by verifying an aggregated signature against an +// aggregated +// public key, and use a recursive verification to find invalid signatures. +void bls_batch_verify(const int sigs_len, byte *results, const E2 *pks_input, + const byte *sigs_bytes, const byte *data, + const int data_len, const byte *seed) { + + // initialize results to undefined + memset(results, UNDEFINED, sigs_len); + + // build the arrays of G1 and G2 elements to verify + E2 *pks = (E2 *)malloc(sigs_len * sizeof(E2)); + if (!pks) + return; + E1 *sigs = (E1 *)malloc(sigs_len * sizeof(E1)); + if (!sigs) + goto out_sigs; + + for (int i = 0; i < sigs_len; i++) { + // convert the signature points: + // - invalid points are stored as infinity points with an invalid result, so + // that the tree aggregations remain valid. + // - valid points are multiplied by a random scalar (same for public keys at + // same index) to make sure a signature at index (i) is verified against the + // public key at the same index. 
+ int read_ret = + E1_read_bytes(&sigs[i], &sigs_bytes[G1_SER_BYTES * i], G1_SER_BYTES); + if (read_ret != VALID || !E1_in_G1(&sigs[i])) { + // set signature and key to infinity (no effect on the aggregation tree) + // and set result to invalid (result won't be overwritten) + E2_set_infty(&pks[i]); + E1_set_infty(&sigs[i]); + results[i] = INVALID; + } else { + // choose a random non-zero coefficient of at least 128 bits + Fr r, one; + // r = random, i-th seed is used for i-th signature + Fr_set_zero(&r); + const int seed_len = SEC_BITS / 8; + limbs_from_be_bytes((limb_t *)&r, seed + (seed_len * i), + seed_len); // faster shortcut than Fr_map_bytes + // r = random + 1 + Fr_set_limb(&one, 1); + Fr_add(&r, &r, &one); + // multiply public key and signature by the same random exponent r + E2_mult(&pks[i], &pks_input[i], &r); + E1_mult(&sigs[i], &sigs[i], &r); } - - // verify the binary tree and fill the results using batch verification - bls_batch_verify_tree(root, sigs_len, &results[0], &h); - // free the allocated tree - free_tree(root); + } + // build a binary tree of aggreagtions + node *root = build_tree(sigs_len, &pks[0], &sigs[0]); + if (!root) + goto out; + + E1 h; + if (map_to_G1(&h, data, data_len) != VALID) { + goto out; + } + + // verify the binary tree and fill the results using batch verification + bls_batch_verify_tree(root, sigs_len, &results[0], &h); + // free the allocated tree + free_tree(root); out: - free(sigs); + free(sigs); out_sigs: - free(pks); + free(pks); } // Verifies the validity of 2 SPoCK proofs and 2 public keys. // Membership check in G1 of both proofs is verified in this function. // Membership check in G2 of both keys is not verified in this function. -// the membership check in G2 is separated to allow optimizing multiple verifications -// using the same public keys. -int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* sig2) { - E1 elemsG1[2]; - E2 elemsG2[2]; - - // elemsG1[0] = s1 - if (E1_read_bytes(&elemsG1[0], sig1, G1_SER_BYTES) != VALID) { - return INVALID; - }; - // check s1 is in G1 - if (!E1_in_G1(&elemsG1[0])) { - return INVALID; - } +// the membership check in G2 is separated to allow optimizing multiple +// verifications using the same public keys. 
+int bls_spock_verify(const E2 *pk1, const byte *sig1, const E2 *pk2, + const byte *sig2) { + E1 elemsG1[2]; + E2 elemsG2[2]; + + // elemsG1[0] = s1 + if (E1_read_bytes(&elemsG1[0], sig1, G1_SER_BYTES) != VALID) { + return INVALID; + }; + // check s1 is in G1 + if (!E1_in_G1(&elemsG1[0])) { + return INVALID; + } - // elemsG1[1] = s2 - if (E1_read_bytes(&elemsG1[1], sig2, G1_SER_BYTES) != VALID) { - return INVALID; - }; - // check s2 is in G1 - if (!E1_in_G1(&elemsG1[1])) { - return INVALID; - } + // elemsG1[1] = s2 + if (E1_read_bytes(&elemsG1[1], sig2, G1_SER_BYTES) != VALID) { + return INVALID; + }; + // check s2 is in G1 + if (!E1_in_G1(&elemsG1[1])) { + return INVALID; + } - // elemsG2[1] = pk1 - E2_copy(&elemsG2[1], pk1); + // elemsG2[1] = pk1 + E2_copy(&elemsG2[1], pk1); - // elemsG2[0] = -pk2 - E2_neg(&elemsG2[0], pk2); + // elemsG2[0] = -pk2 + E2_neg(&elemsG2[0], pk2); - // double pairing - Fp12 e; - Fp12_multi_pairing(&e, elemsG1 , elemsG2, 2); + // double pairing + Fp12 e; + Fp12_multi_pairing(&e, elemsG1, elemsG2, 2); - if (Fp12_is_one(&e)) { - return VALID; - } - return INVALID; + if (Fp12_is_one(&e)) { + return VALID; + } + return INVALID; } - diff --git a/crypto/bls_include.h b/crypto/bls_include.h index c5dba4d45de..af380735237 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -6,17 +6,17 @@ #include "bls12381_utils.h" // BLS signature core (functions in bls_core.c) -int bls_sign(byte*, const Fr*, const byte*, const int); -int bls_verify(const E2*, const byte*, const byte*, const int); -int bls_verifyPerDistinctMessage(const byte*, const int, const byte*, const uint32_t*, - const uint32_t*, const E2*); -int bls_verifyPerDistinctKey(const byte*, - const int, const E2*, const uint32_t*, - const byte*, const uint32_t*); -void bls_batch_verify(const int, byte*, const E2*, - const byte*, const byte*, const int, const byte*); +int bls_sign(byte *, const Fr *, const byte *, const int); +int bls_verify(const E2 *, const byte *, const byte *, const int); +int bls_verifyPerDistinctMessage(const byte *, const int, const byte *, + const uint32_t *, const uint32_t *, + const E2 *); +int bls_verifyPerDistinctKey(const byte *, const int, const E2 *, + const uint32_t *, const byte *, const uint32_t *); +void bls_batch_verify(const int, byte *, const E2 *, const byte *, const byte *, + const int, const byte *); // BLS based SPoCK -int bls_spock_verify(const E2*, const byte*, const E2*, const byte*); +int bls_spock_verify(const E2 *, const byte *, const E2 *, const byte *); #endif diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index e951cc9c33f..dc7e1354907 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -1,109 +1,117 @@ #include "bls_thresholdsign_include.h" // the highest index of a threshold participant -#define MAX_IND 255 -#define MAX_IND_BITS 8 // equal to ceiling(log_2(MAX_IND)) +#define MAX_IND 255 +#define MAX_IND_BITS 8 // equal to ceiling(log_2(MAX_IND)) -// Computes the Lagrange coefficient L_i(0) in Fr with regards to the range [indices(0)..indices(t)] -// and stores it in `res`, where t is the degree of the polynomial P. -// `len` is equal to `t+1` where `t` is the polynomial degree. -static void Fr_lagrange_coeff_at_zero(Fr* res, const int i, const byte indices[], const int len){ +// Computes the Lagrange coefficient L_i(0) in Fr with regards to the range +// [indices(0)..indices(t)] and stores it in `res`, where t is the degree of the +// polynomial P. 
`len` is equal to `t+1` where `t` is the polynomial degree. +static void Fr_lagrange_coeff_at_zero(Fr *res, const int i, + const byte indices[], const int len) { - // coefficient is computed as N * D^(-1) - Fr numerator; // eventually would represent N*R^k - Fr denominator; // eventually would represent D*R^k + // coefficient is computed as N * D^(-1) + Fr numerator; // eventually would represent N*R^k + Fr denominator; // eventually would represent D*R^k - // Initialize N and D to Montgomery constant R - Fr_copy(&numerator, &BLS12_381_rR); - Fr_copy(&denominator, &BLS12_381_rR); + // Initialize N and D to Montgomery constant R + Fr_copy(&numerator, &BLS12_381_rR); + Fr_copy(&denominator, &BLS12_381_rR); - // sign of D: 0 for positive and 1 for negative - int sign = 0; + // sign of D: 0 for positive and 1 for negative + int sign = 0; - // the highest k such that fact(MAX_IND)/fact(MAX_IND-k) < 2^64 (approximately 64/MAX_IND_BITS) - // this means we can multiply up to (k) indices in a limb (64 bits) without overflowing. - #define MAX_IND_LOOPS (64/MAX_IND_BITS) - const int loops = MAX_IND_LOOPS; - int k,j = 0; - Fr tmp; - while (j= 0; i--) { - Fr_mul_montg(image, image, &xR); - Fr_add(image, image, &a[i]); // image is in normal form - } - // compute y = P(x).g2 - if (y) { - G2_mult_gen(y, image); - } + for (int i = degree; i >= 0; i--) { + Fr_mul_montg(image, image, &xR); + Fr_add(image, image, &a[i]); // image is in normal form + } + // compute y = P(x).g2 + if (y) { + G2_mult_gen(y, image); + } } // computes Q(x) = A_0 + A_1*x + ... + A_n*x^n in G2 // and stores the point in y -static void E2_polynomial_image(E2* y, const E2* A, const int degree, const byte x){ - E2_set_infty(y); - for (int i = degree; i >= 0 ; i--) { - E2_mult_small_expo(y, y, x); - E2_add(y, y, &A[i]); - } +static void E2_polynomial_image(E2 *y, const E2 *A, const int degree, + const byte x) { + E2_set_infty(y); + for (int i = degree; i >= 0; i--) { + E2_mult_small_expo(y, y, x); + E2_add(y, y, &A[i]); + } } - // computes y[i] = Q(i+1) for all participants i ( 0 <= i < len_y) // where Q(x) = A_0 + A_1*x + ... 
+ A_n*x^n in G2[X] -void E2_polynomial_images(E2* y, const int len_y, const E2* A, const int degree) { - for (byte i=0; i Date: Fri, 18 Aug 2023 14:14:43 -0600 Subject: [PATCH 141/200] remove clanf-format config file --- crypto/.clang-format | 192 ------------------------------------------- 1 file changed, 192 deletions(-) delete mode 100644 crypto/.clang-format diff --git a/crypto/.clang-format b/crypto/.clang-format deleted file mode 100644 index 48b2c678323..00000000000 --- a/crypto/.clang-format +++ /dev/null @@ -1,192 +0,0 @@ ---- -Language: Cpp -# BasedOnStyle: LLVM -AccessModifierOffset: -2 -AlignAfterOpenBracket: Align -AlignArrayOfStructures: None -AlignConsecutiveMacros: None -AlignConsecutiveAssignments: None -AlignConsecutiveBitFields: None -AlignConsecutiveDeclarations: None -AlignEscapedNewlines: Right -AlignOperands: Align -AlignTrailingComments: true -AllowAllArgumentsOnNextLine: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortEnumsOnASingleLine: true -AllowShortBlocksOnASingleLine: Never -AllowShortCaseLabelsOnASingleLine: false -AllowShortFunctionsOnASingleLine: All -AllowShortLambdasOnASingleLine: All -AllowShortIfStatementsOnASingleLine: Never -AllowShortLoopsOnASingleLine: false -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: false -AlwaysBreakTemplateDeclarations: MultiLine -AttributeMacros: - - __capability -BinPackArguments: true -BinPackParameters: true -BraceWrapping: - AfterCaseLabel: false - AfterClass: false - AfterControlStatement: Never - AfterEnum: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - AfterExternBlock: false - BeforeCatch: false - BeforeElse: false - BeforeLambdaBody: false - BeforeWhile: false - IndentBraces: false - SplitEmptyFunction: true - SplitEmptyRecord: true - SplitEmptyNamespace: true -BreakBeforeBinaryOperators: None -BreakBeforeConceptDeclarations: true -BreakBeforeBraces: Attach -BreakBeforeInheritanceComma: false -BreakInheritanceList: BeforeColon -BreakBeforeTernaryOperators: true -BreakConstructorInitializersBeforeComma: false -BreakConstructorInitializers: BeforeColon -BreakAfterJavaFieldAnnotations: false -BreakStringLiterals: true -ColumnLimit: 80 -CommentPragmas: '^ IWYU pragma:' -QualifierAlignment: Leave -CompactNamespaces: false -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 -Cpp11BracedListStyle: true -DeriveLineEnding: true -DerivePointerAlignment: false -DisableFormat: false -EmptyLineAfterAccessModifier: Never -EmptyLineBeforeAccessModifier: LogicalBlock -ExperimentalAutoDetectBinPacking: false -PackConstructorInitializers: BinPack -BasedOnStyle: '' -ConstructorInitializerAllOnOneLineOrOnePerLine: false -AllowAllConstructorInitializersOnNextLine: true -FixNamespaceComments: true -ForEachMacros: - - foreach - - Q_FOREACH - - BOOST_FOREACH -IfMacros: - - KJ_IF_MAYBE -IncludeBlocks: Preserve -IncludeCategories: - - Regex: '^"(llvm|llvm-c|clang|clang-c)/' - Priority: 2 - SortPriority: 0 - CaseSensitive: false - - Regex: '^(<|"(gtest|gmock|isl|json)/)' - Priority: 3 - SortPriority: 0 - CaseSensitive: false - - Regex: '.*' - Priority: 1 - SortPriority: 0 - CaseSensitive: false -IncludeIsMainRegex: '(Test)?$' -IncludeIsMainSourceRegex: '' -IndentAccessModifiers: false -IndentCaseLabels: false -IndentCaseBlocks: false -IndentGotoLabels: true -IndentPPDirectives: None -IndentExternBlock: AfterExternBlock -IndentRequires: false -IndentWidth: 2 
-IndentWrappedFunctionNames: false -InsertTrailingCommas: None -JavaScriptQuotes: Leave -JavaScriptWrapImports: true -KeepEmptyLinesAtTheStartOfBlocks: true -LambdaBodyIndentation: Signature -MacroBlockBegin: '' -MacroBlockEnd: '' -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCBinPackProtocolList: Auto -ObjCBlockIndentWidth: 2 -ObjCBreakBeforeNestedBlockParam: true -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: true -PenaltyBreakAssignment: 2 -PenaltyBreakBeforeFirstCallParameter: 19 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakOpenParenthesis: 0 -PenaltyBreakString: 1000 -PenaltyBreakTemplateDeclaration: 10 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 60 -PenaltyIndentedWhitespace: 0 -PointerAlignment: Right -PPIndentWidth: -1 -ReferenceAlignment: Pointer -ReflowComments: true -RemoveBracesLLVM: false -SeparateDefinitionBlocks: Leave -ShortNamespaceLines: 1 -SortIncludes: CaseSensitive -SortJavaStaticImport: Before -SortUsingDeclarations: true -SpaceAfterCStyleCast: false -SpaceAfterLogicalNot: false -SpaceAfterTemplateKeyword: true -SpaceBeforeAssignmentOperators: true -SpaceBeforeCaseColon: false -SpaceBeforeCpp11BracedList: false -SpaceBeforeCtorInitializerColon: true -SpaceBeforeInheritanceColon: true -SpaceBeforeParens: ControlStatements -SpaceBeforeParensOptions: - AfterControlStatements: true - AfterForeachMacros: true - AfterFunctionDefinitionName: false - AfterFunctionDeclarationName: false - AfterIfMacros: true - AfterOverloadedOperator: false - BeforeNonEmptyParentheses: false -SpaceAroundPointerQualifiers: Default -SpaceBeforeRangeBasedForLoopColon: true -SpaceInEmptyBlock: false -SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 1 -SpacesInAngles: Never -SpacesInConditionalStatement: false -SpacesInContainerLiterals: true -SpacesInCStyleCastParentheses: false -SpacesInLineCommentPrefix: - Minimum: 1 - Maximum: -1 -SpacesInParentheses: false -SpacesInSquareBrackets: false -SpaceBeforeSquareBrackets: false -BitFieldColonSpacing: Both -Standard: Latest -StatementAttributeLikeMacros: - - Q_EMIT -StatementMacros: - - Q_UNUSED - - QT_REQUIRE_VERSION -TabWidth: 8 -UseCRLF: false -UseTab: Never -WhitespaceSensitiveMacros: - - STRINGIZE - - PP_STRINGIZE - - BOOST_PP_STRINGIZE - - NS_SWIFT_NAME - - CF_SWIFT_NAME -... 
- From 647c0c2222263680aede664b71ee8ca0afd0b2f0 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 18 Aug 2023 14:20:29 -0600 Subject: [PATCH 142/200] add crypto code formatting check to CI --- Makefile | 2 +- crypto/Makefile | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index c927ff4403a..a84d3f0d276 100644 --- a/Makefile +++ b/Makefile @@ -205,7 +205,7 @@ generate-mocks: install-mock-generators tidy: go mod tidy -v cd integration; go mod tidy -v - cd crypto; go mod tidy -v + $(MAKE) -C crypto tidy cd cmd/testclient; go mod tidy -v cd insecure; go mod tidy -v git diff --exit-code diff --git a/crypto/Makefile b/crypto/Makefile index 28e7a5f6f2f..ffde0fa6b57 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -29,7 +29,7 @@ else endif CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) -# format +# format C code .PHONY: c-format c-format: clang-format -style=llvm -dump-config > .clang-format @@ -38,6 +38,13 @@ c-format: rm -f .clang-format git diff --exit-code +# tidy Go and C code +.PHONY: tidy +tidy: c-format + go mod tidy -v + git diff --exit-code + + # test all packages .PHONY: test test: From 63cedc8ffd5e52df559a0160613447a0481b4ca1 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 21 Aug 2023 10:15:50 -0600 Subject: [PATCH 143/200] move c formatting to linter target --- .github/workflows/ci.yml | 5 +++++ Makefile | 2 +- crypto/Makefile | 15 ++++++++++++--- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 57b0da2ace2..9bc0e30291e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,6 +38,10 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true + - name: Install C formatter + run: sudo apt-get install -y clang-format + - name: Run C formatter for ./crypto + run: make -C crypto c-format - name: Run go generate run: go generate working-directory: ${{ matrix.dir }} @@ -50,6 +54,7 @@ jobs: working-directory: ${{ matrix.dir }} # https://github.com/golangci/golangci-lint-action/issues/244 skip-cache: true + tidy: name: Tidy diff --git a/Makefile b/Makefile index a84d3f0d276..9cb7ac5fac9 100644 --- a/Makefile +++ b/Makefile @@ -205,7 +205,7 @@ generate-mocks: install-mock-generators tidy: go mod tidy -v cd integration; go mod tidy -v - $(MAKE) -C crypto tidy + cd crypo; go mod tidy -v cd cmd/testclient; go mod tidy -v cd insecure; go mod tidy -v git diff --exit-code diff --git a/crypto/Makefile b/crypto/Makefile index ffde0fa6b57..be839a18118 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -38,12 +38,21 @@ c-format: rm -f .clang-format git diff --exit-code -# tidy Go and C code -.PHONY: tidy -tidy: c-format +# Go tidy +.PHONY: go-tidy +go-tidy: go mod tidy -v git diff --exit-code +# Go lint +.PHONY: go-lint +go-lint: +lint: go-tidy + # revive -config revive.toml + golangci-lint run -v ./... 
+ + + # test all packages .PHONY: test From a8666e4c97cede542b2e431e60a90181ff2c4882 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 21 Aug 2023 10:57:50 -0600 Subject: [PATCH 144/200] fix linter error --- crypto/bls_test.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 0ead9fd3100..aa1e171b216 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -660,7 +660,6 @@ func TestBLSBatchVerify(t *testing.T) { // number of signatures to aggregate sigsNum := rand.Intn(100) + 2 sigs := make([]Signature, 0, sigsNum) - sks := make([]PrivateKey, 0, sigsNum) pks := make([]PublicKey, 0, sigsNum) expectedValid := make([]bool, 0, sigsNum) @@ -670,7 +669,6 @@ func TestBLSBatchVerify(t *testing.T) { s, err := sk.Sign(input, kmac) require.NoError(t, err) sigs = append(sigs, s) - sks = append(sks, sk) pks = append(pks, sk.PublicKey()) expectedValid = append(expectedValid, true) } From 30e5a7b23aeec7c282550cb700a097e44db1a0ba Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 21 Aug 2023 20:55:45 -0600 Subject: [PATCH 145/200] delete unused fermat inversion --- crypto/bls12381_utils.c | 56 ---------------------------------------- crypto/bls12381_utils.go | 4 +-- crypto/bls12381_utils.h | 3 --- 3 files changed, 2 insertions(+), 61 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 665f3853236..7e1afbf7fbf 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -95,62 +95,6 @@ void Fr_inv_montg_eucl(Fr *res, const Fr *a) { redc_mont_256((limb_t *)res, temp, BLS12_381_r, r0); } -// result is in Montgomery form if base is in montgomery form -// if base = b*R, res = b^expo * R -// In general, res = base^expo * R^(-expo+1) -// `expo` is encoded as a little-endian limb_t table of length `expo_len`. 
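/* For intuition about the removed helpers below: Fr_exp_montg is a left-to-right
 * square-and-multiply, and Fr_inv_exp_montg used it for Fermat inversion
 * (a^(r-2) = a^(-1) mod r for prime r). A self-contained sketch of the same
 * structure on plain word-size arithmetic, without the Montgomery bookkeeping
 * described above; the small prime and names are illustrative only. */
#include <stdint.h>
#include <stdio.h>

/* base^expo mod m, scanning exponent bits from most to least significant.
 * Assumes base, m < 2^32 so the 64-bit products cannot overflow. */
static uint64_t modexp(uint64_t base, uint64_t expo, uint64_t m) {
  uint64_t acc = 1 % m;
  for (int i = 63; i >= 0; i--) {
    acc = (acc * acc) % m;      /* square */
    if ((expo >> i) & 1) {
      acc = (acc * base) % m;   /* multiply when the exponent bit is set */
    }
  }
  return acc;
}

int main(void) {
  uint64_t m = 65537;                      /* small prime standing in for r */
  uint64_t inv = modexp(12345, m - 2, m);  /* Fermat inversion of 12345 mod m */
  printf("%llu\n", (unsigned long long)((inv * 12345) % m)); /* prints 1 */
  return 0;
}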
-// TODO: could be deleted -void Fr_exp_montg(Fr *res, const Fr *base, const limb_t *expo, - const int expo_len) { - // mask of the most significant bit - const limb_t msb_mask = (limb_t)1 << ((sizeof(limb_t) << 3) - 1); - limb_t mask = msb_mask; - int index = 0; - - expo += expo_len; - // process most significant zero limbs - while ((index < expo_len) && (*(--expo) == 0)) { - index++; - } - // if expo is zero - if (index == expo_len) { - Fr_copy(res, base); - return; - } - // expo is non zero - // process the most significant zero bits - while ((*expo & mask) == 0) { - mask >>= 1; - } - Fr tmp; - // process the first `1` bit - Fr_copy(&tmp, base); - mask >>= 1; - // Scan all limbs of the exponent - for (; index < expo_len; expo--) { - // Scan all bits - for (; mask != 0; mask >>= 1) { - // square - Fr_squ_montg(&tmp, &tmp); - // multiply - if (*expo & mask) { - Fr_mul_montg(&tmp, &tmp, base); - } - } - mask = msb_mask; - index++; - } - Fr_copy(res, &tmp); -} - -// TODO: could be deleted -void Fr_inv_exp_montg(Fr *res, const Fr *a) { - Fr r_2; - Fr_copy(&r_2, (Fr *)BLS12_381_r); - r_2.limbs[0] -= 2; - Fr_exp_montg(res, a, (limb_t *)&r_2, 4); -} - // computes the sum of the array elements and writes the sum in jointx void Fr_sum_vector(Fr *jointx, const Fr x[], const int len) { Fr_set_zero(jointx); diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 40580ca7239..a972ca46b64 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -16,8 +16,8 @@ package crypto // static void handler(int signum) // { char text[1024] = "Caught SIGILL in blst_cgo_init, BLST library (used by flow-go/crypto) requires ADX support, build with CGO_CFLAGS=\"-O -D__BLST_PORTABLE__\"\n"; // ssize_t n = write(2, &text, strlen(text)); -// _exit(128+SIGILL); -// (void)n; +// _exit(128+SIGILL); +// (void)n; // } // __attribute__((constructor)) static void flow_crypto_cgo_init() // { Fp temp = { 0 }; diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index b2ea2654228..1936c151497 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -66,10 +66,7 @@ void Fr_mul_montg(Fr *res, const Fr *a, const Fr *b); void Fr_squ_montg(Fr *res, const Fr *a); void Fr_to_montg(Fr *res, const Fr *a); void Fr_from_montg(Fr *res, const Fr *a); -void Fr_exp_montg(Fr *res, const Fr *base, const limb_t *expo, - const int expo_len); void Fr_inv_montg_eucl(Fr *res, const Fr *a); -void Fr_inv_exp_montg(Fr *res, const Fr *a); ERROR Fr_read_bytes(Fr *a, const byte *bin, int len); ERROR Fr_star_read_bytes(Fr *a, const byte *bin, int len); void Fr_write_bytes(byte *bin, const Fr *a); From 8556f69ff882b80a28d8f3f57440416b8aa6f1a1 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 22 Aug 2023 11:02:57 -0600 Subject: [PATCH 146/200] update README with BLST update steps --- crypto/bls12381_utils.c | 2 +- crypto/blst_src/README.md | 22 +++++++++++++++++----- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 7e1afbf7fbf..b583462886b 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -22,7 +22,7 @@ const Fr BLS12_381_rR = {{ // returns true if a == 0 and false otherwise bool Fr_is_zero(const Fr *a) { - return bytes_are_zero((const byte *)a, sizeof(Fr)); + return vec_is_zero(a, sizeof(Fr)); } // returns true if a == b and false otherwise diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md index 877c9db7ee5..1234169dbef 100644 --- a/crypto/blst_src/README.md +++ b/crypto/blst_src/README.md @@ 
-9,9 +9,21 @@ While BLST exports multiple functions and tools, the implementation in Flow cryp The folder contains: - BLST LICENSE file -- all /src/*.c and /src/*.h files (C source files) -- all /build (assembly generated files) -- /bindings/blst.h (headers of external functions) -- /bindings/blst_aux.h (headers of external aux functions) +- all `/src/*.c` and `/src/*.h` files (C source files) but `server.c`. +- `server.c` is replaced by `blst_src.c` (which lists only the files needed by Flow crypto). +- all `/build` (assembly generated files). +- `/bindings/blst.h` (headers of external functions). +- `/bindings/blst_aux.h` (headers of external aux functions). +- this `README` file. -TODO: add steps for upgrading the BLST version \ No newline at end of file +To upgrade the BLST version: +- [ ] delete all files in this folder but `blst_src.c` and `README.md`. +- [ ] open BLST repository on the new version. +- [ ] copy all `.c` and `.h` files from `/src/` into this folder. +- [ ] delete `server.c` from this folder. +- [ ] copy the folder `/build/` into this folder. +- [ ] copy `/bindings/blst.h` and `/bindings/blst_aux.h` into this folder. +- [ ] solve all breaking changes that may occur. +- [ ] update the commit version on this `README`. + +Remember that Flow crypto is using non exported internal functions from BLST. Checking for interfaces breaking changes in BLST should made along with auditing changes between the old and new versions. This includes checking logical changes and assumptions beyond interfaces, and assessing their security and performance impact on protocols implemented in Flow crypto. \ No newline at end of file From c1f294ce73284c49b5db4815a690a671c2e59e0b Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 24 Aug 2023 11:33:01 -0600 Subject: [PATCH 147/200] temp tmate debug and compile with asan --- .github/workflows/ci.yml | 2 ++ crypto/bls12381_utils.go | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9bc0e30291e..30f9c107bb3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -69,6 +69,8 @@ jobs: cache: true - name: Run tidy run: make tidy + - name: Setup tmate session + uses: mxschmitt/action-tmate@v3 create-dynamic-test-matrix: name: Create Dynamic Test Matrix diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index a972ca46b64..2a4e07c0d45 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -4,9 +4,10 @@ package crypto // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros +// #cgo CFLAGS: -O -D__BLST_PORTABLE__ -O0 -g -fsanitize=address -I${SRCDIR}/ -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ +// #cgo LDFLAGS: -fsanitize=address // #include "bls12381_utils.h" // // #if defined(__x86_64__) && (defined(__unix__) || defined(__APPLE__)) From bacdb3393cfc22f81a190c089815dbfd2c057a9f Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 24 Aug 2023 12:29:32 -0600 Subject: [PATCH 148/200] tmp --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 30f9c107bb3..e3cda99316f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,6 +38,8 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true + - name: Setup tmate session + uses: mxschmitt/action-tmate@v3 - name: Install C formatter run: sudo apt-get install -y clang-format - name: Run C formatter for ./crypto @@ -69,8 +71,6 @@ jobs: cache: true - name: Run tidy run: make tidy - - name: Setup tmate session - uses: mxschmitt/action-tmate@v3 create-dynamic-test-matrix: name: Create Dynamic Test Matrix From c8b643ce2f1a7dddf95359bf1b0cd52d8562cd6e Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 24 Aug 2023 15:57:56 -0600 Subject: [PATCH 149/200] add new target for sanitization --- crypto/Makefile | 13 +++++++++++++ crypto/bls12381_utils.go | 3 +-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/crypto/Makefile b/crypto/Makefile index be839a18118..2d50f0d1d75 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -38,6 +38,19 @@ c-format: rm -f .clang-format git diff --exit-code +# sanitize C code +# cannot run on macos +.PHONY: c-sanitize +c-format: +# memory sanitization + $(CGO_FLAG) CC="clang -O0 -g -fsanitize=memory -fno-omit-frame-pointer" \ + LD="-fsanitize=memory" go test \ + if [ $$? -ne 0 ]; then exit 1; fi +# address sanitization and other checks + $(CGO_FLAG) CC="clang -O0 -g -fsanitize=address -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=all -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow -fno-sanitize=null -fno-sanitize=alignment" \ + LD="-fsanitize=address" go test \ + if [ $$? -ne 0 ]; then exit 1; fi + # Go tidy .PHONY: go-tidy go-tidy: diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 2a4e07c0d45..a972ca46b64 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -4,10 +4,9 @@ package crypto // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -// #cgo CFLAGS: -O -D__BLST_PORTABLE__ -O0 -g -fsanitize=address -I${SRCDIR}/ -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros +// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ -// #cgo LDFLAGS: -fsanitize=address // #include "bls12381_utils.h" // // #if defined(__x86_64__) && (defined(__unix__) || defined(__APPLE__)) From 62c1a166b326114a6ddb9829f86d75592b5e947c Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 24 Aug 2023 16:44:44 -0600 Subject: [PATCH 150/200] add sanitizer to ci job --- crypto/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crypto/Makefile b/crypto/Makefile index 2d50f0d1d75..7855284f9bb 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -41,9 +41,9 @@ c-format: # sanitize C code # cannot run on macos .PHONY: c-sanitize -c-format: +c-sanitize: # memory sanitization - $(CGO_FLAG) CC="clang -O0 -g -fsanitize=memory -fno-omit-frame-pointer" \ + $(CGO_FLAG) CC="clang -O -D__BLST_PORTABLE__ -O0 -g -fsanitize=memory -fno-omit-frame-pointer" \ LD="-fsanitize=memory" go test \ if [ $$? 
-ne 0 ]; then exit 1; fi # address sanitization and other checks From e557e8beebaa5301efe0be3d0af8543096eb25a8 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 28 Aug 2023 20:06:46 -0600 Subject: [PATCH 151/200] add more sanitization flags and restrict sanitization to linux --- crypto/Makefile | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/crypto/Makefile b/crypto/Makefile index 7855284f9bb..51454b801d8 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -3,6 +3,9 @@ COVER_PROFILE := cover.out IMAGE_TAG := v0.0.7 +# OS +UNAME := $(shell uname -s) + # allows CI to specify whether to have race detection on / off ifeq ($(RACE_DETECTOR),1) RACE_FLAG := -race @@ -11,7 +14,7 @@ else endif # `ADX_SUPPORT` is 1 if ADX instructions are supported and 0 otherwise. -ifeq ($(shell uname -s),Linux) +ifeq ($(UNAME),Linux) # detect ADX support on the CURRENT linux machine. ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) else @@ -40,16 +43,21 @@ c-format: # sanitize C code # cannot run on macos -.PHONY: c-sanitize +.SILENT: c-sanitize c-sanitize: -# memory sanitization - $(CGO_FLAG) CC="clang -O -D__BLST_PORTABLE__ -O0 -g -fsanitize=memory -fno-omit-frame-pointer" \ - LD="-fsanitize=memory" go test \ - if [ $$? -ne 0 ]; then exit 1; fi -# address sanitization and other checks - $(CGO_FLAG) CC="clang -O0 -g -fsanitize=address -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=all -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow -fno-sanitize=null -fno-sanitize=alignment" \ - LD="-fsanitize=address" go test \ - if [ $$? -ne 0 ]; then exit 1; fi +# - memory sanitization (only on linux and using clang) - (could use go test -msan) +# - address sanitization and other checks (only on linux) + if [ $(UNAME) = "Linux" ]; then \ + $(CGO_FLAG) CC="clang -O0 -g -fsanitize=memory -fno-omit-frame-pointer -fsanitize-memory-track-origins" \ + LD="-fsanitize=memory" go test; \ + if [ $$? -ne 0 ]; then exit 1; fi; \ + \ + $(CGO_FLAG) CC="-O0 -g -fsanitize=address -fno-omit-frame-pointer -fsanitize=leak -fsanitize=undefined -fno-sanitize-recover=all -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow -fno-sanitize=null -fno-sanitize=alignment" \ + LD="-fsanitize=address -fsanitize=leak" go test; \ + if [ $$? 
-ne 0 ]; then exit 1; fi; \ + else \ + echo "sanitization is only supported on Linux"; \ + fi; \ # Go tidy .PHONY: go-tidy From f8bc02b9d63580a0d832e4e6eb6a9be0d8f32b0f Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 28 Aug 2023 20:07:22 -0600 Subject: [PATCH 152/200] add sanitization to ci --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e3cda99316f..21fdbd7834e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -42,8 +42,8 @@ jobs: uses: mxschmitt/action-tmate@v3 - name: Install C formatter run: sudo apt-get install -y clang-format - - name: Run C formatter for ./crypto - run: make -C crypto c-format + - name: Run C formatter and sanitizer for ./crypto + run: make -C crypto c-format && make -C crypto c-sanitize - name: Run go generate run: go generate working-directory: ${{ matrix.dir }} From d58b172fa8710807d7682451f08734402e35ef43 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 28 Aug 2023 20:07:58 -0600 Subject: [PATCH 153/200] disable sanitization for E1_write_bytes because of false positive --- crypto/bls12381_utils.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index b583462886b..653935c197f 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -168,7 +168,7 @@ ERROR Fr_star_read_bytes(Fr *a, const byte *bin, int len) { // write Fr element `a` in big endian bytes. void Fr_write_bytes(byte *bin, const Fr *a) { - // be_bytes_from_limbs works for both limb endiannesses + // be_bytes_from_limbs works for both limb endianness types be_bytes_from_limbs(bin, (limb_t *)a, Fr_BYTES); } @@ -302,7 +302,8 @@ ERROR Fp_read_bytes(Fp *a, const byte *bin, int len) { return VALID; } -// write Fp element to bin and assume `bin` has `Fp_BYTES` allocated bytes. +// write Fp element to `bin`, +// assuming `bin` has `Fp_BYTES` allocated bytes. void Fp_write_bytes(byte *bin, const Fp *a) { be_bytes_from_limbs(bin, (limb_t *)a, Fp_BYTES); } @@ -523,8 +524,8 @@ ERROR E1_read_bytes(E1 *a, const byte *bin, const int len) { Fp_squ_montg(&a->y, &a->x); Fp_mul_montg(&a->y, &a->y, &a->x); // x^3 Fp_add(&a->y, &a->y, &B_E1); // B_E1 is already in Montg form - if (!Fp_sqrt_montg(&a->y, - &a->y)) { // check whether x^3+b is a quadratic residue + // check whether x^3+b is a quadratic residue + if (!Fp_sqrt_montg(&a->y, &a->y)) { return POINT_NOT_ON_CURVE; } @@ -539,7 +540,13 @@ ERROR E1_read_bytes(E1 *a, const byte *bin, const int len) { // uncompressed form. It assumes buffer is of length G1_SER_BYTES The // serialization follows: // https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) +#if defined(__has_feature) && __has_feature(memory_sanitizer) +// disable memory sanitization in this function because of a use-of-uninitialized-value +// false positive. 
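/* Illustrative aside on the sanitizers wired into the Makefile targets above:
 * they catch different bug classes. A minimal stand-alone example of each,
 * assuming the same clang flags used there:
 *   - heap out-of-bounds access       -> reported under -fsanitize=address
 *   - branching on uninitialized data -> reported under -fsanitize=memory
 * MSan additionally expects every linked object to be instrumented, which is
 * why uninstrumented hand-written assembly can surface as false positives like
 * the one worked around here. */
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  int *a = malloc(4 * sizeof(int));
  a[4] = 1;        /* heap-buffer-overflow: flagged by AddressSanitizer */
  int b;           /* intentionally left uninitialized */
  if (b > 0) {     /* use-of-uninitialized-value: flagged by MemorySanitizer */
    printf("positive\n");
  }
  free(a);
  return 0;
}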
+void __attribute__((no_sanitize("memory"))) E1_write_bytes(byte *bin, const E1 *a) { +#else void E1_write_bytes(byte *bin, const E1 *a) { +#endif if (E1_is_infty(a)) { // set the infinity bit bin[0] = (G1_SERIALIZATION << 7) | (1 << 6); From e280664405f7ae9c9dfec8d338bc293933a5206c Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 29 Aug 2023 13:45:39 -0600 Subject: [PATCH 154/200] split c-sanitize and disable msan from CI - add NO_MSAN macro --- crypto/Makefile | 36 +++++++++++++++++++++++++++--------- crypto/bls12381_utils.c | 8 +------- crypto/bls12381_utils.h | 19 +++++++++++++++++-- 3 files changed, 45 insertions(+), 18 deletions(-) diff --git a/crypto/Makefile b/crypto/Makefile index 51454b801d8..c58c0f55635 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -41,17 +41,11 @@ c-format: rm -f .clang-format git diff --exit-code -# sanitize C code -# cannot run on macos -.SILENT: c-sanitize -c-sanitize: -# - memory sanitization (only on linux and using clang) - (could use go test -msan) +# address sanitization and other checks +.SILENT: c-asan +c-asan: # - address sanitization and other checks (only on linux) if [ $(UNAME) = "Linux" ]; then \ - $(CGO_FLAG) CC="clang -O0 -g -fsanitize=memory -fno-omit-frame-pointer -fsanitize-memory-track-origins" \ - LD="-fsanitize=memory" go test; \ - if [ $$? -ne 0 ]; then exit 1; fi; \ - \ $(CGO_FLAG) CC="-O0 -g -fsanitize=address -fno-omit-frame-pointer -fsanitize=leak -fsanitize=undefined -fno-sanitize-recover=all -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow -fno-sanitize=null -fno-sanitize=alignment" \ LD="-fsanitize=address -fsanitize=leak" go test; \ if [ $$? -ne 0 ]; then exit 1; fi; \ @@ -59,6 +53,30 @@ c-sanitize: echo "sanitization is only supported on Linux"; \ fi; \ +# memory sanitization +.SILENT: c-msan +c-msan: +# - memory sanitization (only on linux and using clang) - (could use go test -msan) +# currently, this leads to many false positives, most likely because of assembly code not handled properly +# by asan. If you would like to run this command, you can use `NO_MSAN` to diable msan in some C functions. +# For instance "void NO_MSAN f() {...}" disables msan in function f. `NO_MSAN` is already defined in +# bls12381_utils.h + if [ $(UNAME) = "Linux" ]; then \ + $(CGO_FLAG) CC="clang -DMSAN -O0 -g -fsanitize=memory -fno-omit-frame-pointer -fsanitize-memory-track-origins" \ + LD="-fsanitize=memory" go test; \ + if [ $$? -ne 0 ]; then exit 1; fi; \ + else \ + echo "sanitization is only supported on Linux"; \ + fi; \ + +# sanitize C code +.SILENT: c-sanitize +c-sanitize: c-asan +# - address sanitization and other checks (only on linux) +# - memory sanitization (target m-san) is disabled because of multiple false positives + + + # Go tidy .PHONY: go-tidy go-tidy: diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 653935c197f..25bffcd6bd8 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -540,13 +540,7 @@ ERROR E1_read_bytes(E1 *a, const byte *bin, const int len) { // uncompressed form. It assumes buffer is of length G1_SER_BYTES The // serialization follows: // https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -#if defined(__has_feature) && __has_feature(memory_sanitizer) -// disable memory sanitization in this function because of a use-of-uninitialized-value -// false positive. 
-void __attribute__((no_sanitize("memory"))) E1_write_bytes(byte *bin, const E1 *a) { -#else void E1_write_bytes(byte *bin, const E1 *a) { -#endif if (E1_is_infty(a)) { // set the infinity bit bin[0] = (G1_SERIALIZATION << 7) | (1 << 6); @@ -1063,7 +1057,7 @@ void xmd_sha256(byte *hash, int len_hash, byte *msg, int len_msg, byte *dst, } // DEBUG printing functions -#if (DEBUG == 1) +#ifdef DEBUG void bytes_print_(char *s, byte *data, int len) { if (strlen(s)) printf("[%s]:\n", s); diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 1936c151497..2e6f39bd0d5 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -129,8 +129,8 @@ void Fp12_multi_pairing(Fp12 *, const E1 *, const E2 *, const int); void xmd_sha256(byte *, int, byte *, int, byte *, int); // Debugging related functions -#define DEBUG 1 -#if (DEBUG == 1) +// DEBUG can be enabled directly from the Go command: CC="clang -DDEBUG" go test +#ifdef DEBUG #include void bytes_print_(char *, byte *, int); void Fr_print_(char *, Fr *); @@ -139,6 +139,21 @@ void Fp2_print_(char *, const Fp2 *); void Fp12_print_(char *, const Fp12 *); void E1_print_(char *, const E1 *, const int); void E2_print_(char *, const E2 *, const int); + #endif /* DEBUG */ +// memory sanitization disabler +#define NO_MSAN +#ifdef MSAN +/* add NO_MSAN to a function defintion to disable MSAN in that function ( void NO_MSAN f(..) {} ) */ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) +// disable memory sanitization in this function because of a use-of-uninitialized-value +// false positive. +#undef NO_MSAN +#define NO_MSAN __attribute__((no_sanitize("memory"))) +#endif /* __has_feature(memory_sanitizer) */ +#endif /* __has_feature*/ +#endif /*MSAN*/ + #endif /* BLS12_381_UTILS */ \ No newline at end of file From aa8d79eb0c1702b748960d342b18e464b10f3f4b Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 29 Aug 2023 13:52:30 -0600 Subject: [PATCH 155/200] disable tmate and format --- crypto/bls12381_utils.c | 6 ++---- crypto/bls12381_utils.h | 9 +++++---- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 25bffcd6bd8..0f158055fd8 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -21,9 +21,7 @@ const Fr BLS12_381_rR = {{ }}; // returns true if a == 0 and false otherwise -bool Fr_is_zero(const Fr *a) { - return vec_is_zero(a, sizeof(Fr)); -} +bool Fr_is_zero(const Fr *a) { return vec_is_zero(a, sizeof(Fr)); } // returns true if a == b and false otherwise bool Fr_is_equal(const Fr *a, const Fr *b) { @@ -1057,7 +1055,7 @@ void xmd_sha256(byte *hash, int len_hash, byte *msg, int len_msg, byte *dst, } // DEBUG printing functions -#ifdef DEBUG +#ifdef DEBUG void bytes_print_(char *s, byte *data, int len) { if (strlen(s)) printf("[%s]:\n", s); diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 2e6f39bd0d5..fed426eb997 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -129,7 +129,7 @@ void Fp12_multi_pairing(Fp12 *, const E1 *, const E2 *, const int); void xmd_sha256(byte *, int, byte *, int, byte *, int); // Debugging related functions -// DEBUG can be enabled directly from the Go command: CC="clang -DDEBUG" go test +// DEBUG can be enabled directly from the Go command: CC="clang -DDEBUG" go test #ifdef DEBUG #include void bytes_print_(char *, byte *, int); @@ -145,11 +145,12 @@ void E2_print_(char *, const E2 *, const int); // memory sanitization disabler #define NO_MSAN #ifdef MSAN -/* add NO_MSAN 
to a function defintion to disable MSAN in that function ( void NO_MSAN f(..) {} ) */ +/* add NO_MSAN to a function defintion to disable MSAN in that function ( void + * NO_MSAN f(..) {} ) */ #if defined(__has_feature) #if __has_feature(memory_sanitizer) -// disable memory sanitization in this function because of a use-of-uninitialized-value -// false positive. +// disable memory sanitization in this function because of a +// use-of-uninitialized-value false positive. #undef NO_MSAN #define NO_MSAN __attribute__((no_sanitize("memory"))) #endif /* __has_feature(memory_sanitizer) */ From b2302c96a7d931d8366a00d13b756bb050a20389 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 29 Aug 2023 14:03:49 -0600 Subject: [PATCH 156/200] add missing change --- .github/workflows/ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 21fdbd7834e..c0fd71d4030 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,8 +38,6 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Setup tmate session - uses: mxschmitt/action-tmate@v3 - name: Install C formatter run: sudo apt-get install -y clang-format - name: Run C formatter and sanitizer for ./crypto From ceab7e0fc31960ebc6332a4a0a555b2b58e0ab5a Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 29 Aug 2023 14:09:40 -0600 Subject: [PATCH 157/200] fix asan command --- crypto/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/Makefile b/crypto/Makefile index c58c0f55635..43aae8ef39f 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -46,7 +46,7 @@ c-format: c-asan: # - address sanitization and other checks (only on linux) if [ $(UNAME) = "Linux" ]; then \ - $(CGO_FLAG) CC="-O0 -g -fsanitize=address -fno-omit-frame-pointer -fsanitize=leak -fsanitize=undefined -fno-sanitize-recover=all -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow -fno-sanitize=null -fno-sanitize=alignment" \ + $(CGO_FLAG) CC="clang -O0 -g -fsanitize=address -fno-omit-frame-pointer -fsanitize=leak -fsanitize=undefined -fno-sanitize-recover=all -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow -fno-sanitize=null -fno-sanitize=alignment" \ LD="-fsanitize=address -fsanitize=leak" go test; \ if [ $$? -ne 0 ]; then exit 1; fi; \ else \ From 8314b0cb0b01cd55dbdb669d1fcdbd4ac5c6953d Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 29 Aug 2023 18:23:01 -0600 Subject: [PATCH 158/200] more details about updating BLST version --- crypto/blst_src/README.md | 4 +++- crypto/blst_src/blst_src.c | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md index 1234169dbef..f6adff64fea 100644 --- a/crypto/blst_src/README.md +++ b/crypto/blst_src/README.md @@ -17,11 +17,13 @@ The folder contains: - this `README` file. To upgrade the BLST version: -- [ ] delete all files in this folder but `blst_src.c` and `README.md`. +- [ ] delete all files in this folder (`./blst_src`) but `blst_src.c` and `README.md`. - [ ] open BLST repository on the new version. - [ ] copy all `.c` and `.h` files from `/src/` into this folder. - [ ] delete `server.c` from this folder. +- [ ] update `blst_src.c` if needed. - [ ] copy the folder `/build/` into this folder. +- [ ] move `./blst_src/build/assembly.S` to `./blst_src/build/blst_assembly.S`. - [ ] copy `/bindings/blst.h` and `/bindings/blst_aux.h` into this folder. - [ ] solve all breaking changes that may occur. 
- [ ] update the commit version on this `README`. diff --git a/crypto/blst_src/blst_src.c b/crypto/blst_src/blst_src.c index b904a5d52ee..a50649e5788 100644 --- a/crypto/blst_src/blst_src.c +++ b/crypto/blst_src/blst_src.c @@ -1,3 +1,8 @@ +// This file contains all BLST lib C files needed for +// Flow crypto. +// +// The list may need to be updated in a new version of BLST is used. + #include "keygen.c" #include "hash_to_field.c" #include "e1.c" From 4adb5cfa9ad2e28fd6d937455b73289b11ff80e9 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 1 Sep 2023 18:31:05 -0600 Subject: [PATCH 159/200] minor macro improvement --- crypto/bls12381_utils.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index fed426eb997..2594786ad36 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -44,12 +44,12 @@ typedef enum { #define G2_BYTES (2 * Fp2_BYTES) // Compressed and uncompressed points -#define COMPRESSED 1 #define UNCOMPRESSED 0 +#define COMPRESSED (UNCOMPRESSED^1) #define G1_SERIALIZATION (COMPRESSED) #define G2_SERIALIZATION (COMPRESSED) -#define G1_SER_BYTES (G1_BYTES / (G1_SERIALIZATION + 1)) -#define G2_SER_BYTES (G2_BYTES / (G2_SERIALIZATION + 1)) +#define G1_SER_BYTES (G1_SERIALIZATION==UNCOMPRESSED ? G1_BYTES : (G1_BYTES/2)) +#define G2_SER_BYTES (G2_SERIALIZATION==UNCOMPRESSED ? G2_BYTES : (G2_BYTES/2)) // Fr utilities extern const Fr BLS12_381_rR; From e6b29bc8e83c673d68373d53cdf0962778a32a1c Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 1 Sep 2023 18:55:42 -0600 Subject: [PATCH 160/200] add types sanity check in init() --- crypto/bls12381_utils.c | 7 +++++++ crypto/bls12381_utils.go | 4 +++- crypto/bls12381_utils.h | 3 +++ crypto/blst_include.h | 2 +- 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 0f158055fd8..69ce2ba9c2f 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -9,6 +9,13 @@ // compile all blst C src along with this file #include "blst_src.c" +// make sure flow crypto types are consistent with BLST types +void types_sanity(void) { + assert(sizeof(Fp)==sizeof(vec384)); + assert(sizeof(E1)==sizeof(POINTonE1)); + assert(sizeof(E2)==sizeof(POINTonE2)); +} + // ------------------- Fr utilities // Montgomery constant R related to the curve order r diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index a972ca46b64..e9a72b6a5e5 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -4,7 +4,7 @@ package crypto // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros +// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros -Wno-unused-variable // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ // #include "bls12381_utils.h" @@ -71,6 +71,8 @@ var g2PublicKey pubKeyBLSBLS12381 // initialization of BLS12-381 curve func initBLS12381() { + C.types_sanity() + if isG1Compressed() { g1SerHeader = 0xC0 } else { diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 2594786ad36..134dd21bdc6 100644 --- 
a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -51,6 +51,9 @@ typedef enum { #define G1_SER_BYTES (G1_SERIALIZATION==UNCOMPRESSED ? G1_BYTES : (G1_BYTES/2)) #define G2_SER_BYTES (G2_SERIALIZATION==UNCOMPRESSED ? G2_BYTES : (G2_BYTES/2)) +// init-related functions +void types_sanity(void); + // Fr utilities extern const Fr BLS12_381_rR; bool Fr_is_zero(const Fr *a); diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 1f0ed3b17ce..dc942b5976b 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -31,7 +31,7 @@ typedef vec384 Fp; // curve E_1 (over F_p) // E_1 points are represented in Jacobian coordinates (x,y,z), // where x, y, x are elements of F_p (type `Fp`). -// `E1` is equivelent to type `POINTonE1` (used internally by BLST for Jacobian +// `E1` is equivalent to type `POINTonE1` (used internally by BLST for Jacobian // E1 elements) `E1` is defined as a struct to be exportable through cgo to the // Go layer. `E1` is also used to represent all subgroup G_1 elements. typedef struct { From 5305b28943981642d0c253d93768e36a73f57443 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 1 Sep 2023 19:34:55 -0600 Subject: [PATCH 161/200] add affine conversions for potential public keys that can be used in muliple pairings --- crypto/bls12381_utils.c | 31 ++++++++++++++++++++++++++----- crypto/bls12381_utils.go | 7 ++++++- crypto/bls12381_utils.h | 3 ++- crypto/bls_multisig.go | 2 +- crypto/dkg_jointfeldman.go | 4 ++-- 5 files changed, 37 insertions(+), 10 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 69ce2ba9c2f..e040d018024 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -708,6 +708,8 @@ const E2 *BLS12_381_minus_g2 = (const E2 *)&BLS12_381_NEG_G2; // E2_read_bytes imports a E2(Fp^2) point from a buffer in a compressed or // uncompressed form. The resulting point is guaranteed to be on curve E2 (no G2 // check is included). +// E2 point is in affine coordinates. This avoids further conversions +// when the point is used in multiple pairing computation. // // returns: // - BAD_ENCODING if the length is invalid or serialization header bits are @@ -878,7 +880,7 @@ void E2_add(E2 *res, const E2 *a, const E2 *b) { } // generic point double that must handle point at infinity -void E2_double(E2 *res, const E2 *a) { +static void E2_double(E2 *res, const E2 *a) { POINTonE2_double((POINTonE2 *)res, (POINTonE2 *)a); } @@ -934,6 +936,15 @@ void G2_mult_gen(E2 *res, const Fr *expo) { vec_zero(&tmp, sizeof(tmp)); } +// Exponentiation of generator g2 of G2, res = expo.g2 +// +// This is useful for results being used multiple times in pairings. +// Conversion to affine saves later pre-pairing conversions. +void G2_mult_gen_to_affine(E2 *res, const Fr *expo) { + G2_mult_gen(res, expo); + E2_to_affine(res, res); +} + // checks if input E2 point is on the subgroup G2. // It assumes input `p` is on E2. bool E2_in_G2(const E2 *p) { @@ -949,6 +960,16 @@ void E2_sum_vector(E2 *sum, const E2 *y, const int len) { } } +// computes the sum of the E2 array elements `y[i]`, converts it +// to affine coordinates, and writes it in `sum`. +// +// This is useful for results being used multiple times in pairings. +// Conversion to affine saves later pre-pairing conversions. 
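/* Background note (assuming the usual Jacobian convention mentioned in
 * blst_include.h): the affine point is recovered from Jacobian (X, Y, Z) as
 * x = X / Z^2 and y = Y / Z^3, which costs a field inversion. Paying that
 * inversion once on an aggregated result, as in E2_sum_vector_to_affine below,
 * is what saves the repeated pre-pairing conversions mentioned above. */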
+void E2_sum_vector_to_affine(E2 *sum, const E2 *y, const int len) { + E2_sum_vector(sum, y, len); + E2_to_affine(sum, sum); +} + // Subtracts all G2 array elements `y` from an element `x` and writes the // result in res void E2_subtract_vector(E2 *res, const E2 *x, const E2 *y, const int len) { @@ -1014,7 +1035,7 @@ void Fp12_multi_pairing(Fp12 *res, const E1 *p, const E2 *q, const int len) { continue; } // `miller_loop_n` expects affine coordinates in a `POINTonEx_affine` array. - // `POINTonEx_affine` has a different size than `POINTonEx` or `Ex` ! + // `POINTonEx_affine` has a different size than `POINTonEx` and `Ex` ! E1 tmp1; E1_to_affine(&tmp1, p + i); vec_copy(p_aff + n, &tmp1, sizeof(POINTonE1_affine)); @@ -1022,7 +1043,7 @@ void Fp12_multi_pairing(Fp12 *res, const E1 *p, const E2 *q, const int len) { E2_to_affine(&tmp2, q + i); vec_copy(q_aff + n, &tmp2, sizeof(POINTonE2_affine)); n++; - if (n == N_MAX) { // if p_ and q_ are filled, batch `N_MAX` miller loops + if (n == N_MAX) { // if p_aff and q_aff are filled, batch `N_MAX` miller loops if (!init_flag) { miller_loop_n(res_vec, q_aff, p_aff, N_MAX); init_flag = 1; @@ -1034,8 +1055,8 @@ void Fp12_multi_pairing(Fp12 *res, const E1 *p, const E2 *q, const int len) { n = 0; } } - // if p_ and q_ aren't empty, - // remaining couples are also batched in `n` miller loops + // if p_aff and q_aff aren't empty, + // the remaining couples are also batched in `n` miller loops if (n > 0) { if (!init_flag) { miller_loop_n(res_vec, q_aff, p_aff, n); diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index e9a72b6a5e5..21d9e13af05 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -112,8 +112,13 @@ func generatorScalarMultG1(res *pointE1, expo *scalar) { } // Scalar multiplication of generator g2 in G2 +// +// This often results in a public key that is used in +// multiple pairing computation. Therefore, convert the +// resulting point to affine coordinate to save pre-pairing +// conversions. 
func generatorScalarMultG2(res *pointE2, expo *scalar) { - C.G2_mult_gen((*C.E2)(res), (*C.Fr)(expo)) + C.G2_mult_gen_to_affine((*C.E2)(res), (*C.Fr)(expo)) } // comparison in Fr where r is the group order of G1/G2 diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 134dd21bdc6..1098144fd7c 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -112,12 +112,13 @@ void E2_to_affine(E2 *, const E2 *); ERROR E2_read_bytes(E2 *, const byte *, const int); void E2_write_bytes(byte *, const E2 *); void G2_mult_gen(E2 *, const Fr *); +void G2_mult_gen_to_affine(E2 *, const Fr *); void E2_mult(E2 *, const E2 *, const Fr *); void E2_mult_small_expo(E2 *, const E2 *, const byte); void E2_add(E2 *res, const E2 *a, const E2 *b); -void E2_double(E2 *res, const E2 *a); void E2_neg(E2 *, const E2 *); void E2_sum_vector(E2 *, const E2 *, const int); +void E2_sum_vector_to_affine(E2 *, const E2 *, const int); void E2_subtract_vector(E2 *res, const E2 *x, const E2 *y, const int len); bool E2_in_G2(const E2 *); void unsafe_map_bytes_to_G2(E2 *, const byte *, int); diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index 7f57cd09888..2567aaf2ba0 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -184,7 +184,7 @@ func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { } var sum pointE2 - C.E2_sum_vector((*C.E2)(&sum), (*C.E2)(&points[0]), + C.E2_sum_vector_to_affine((*C.E2)(&sum), (*C.E2)(&points[0]), (C.int)(len(points))) sumKey := newPubKeyBLSBLS12381(&sum) diff --git a/crypto/dkg_jointfeldman.go b/crypto/dkg_jointfeldman.go index 40db316efb5..115730e33d9 100644 --- a/crypto/dkg_jointfeldman.go +++ b/crypto/dkg_jointfeldman.go @@ -302,12 +302,12 @@ func (s *JointFeldmanState) sumUpQualifiedKeys(qualified int) (*scalar, *pointE2 (C.int)(qualified)) // sum up Y var jointPublicKey pointE2 - C.E2_sum_vector((*C.E2)(&jointPublicKey), + C.E2_sum_vector_to_affine((*C.E2)(&jointPublicKey), (*C.E2)(&qualifiedPubKey[0]), (C.int)(qualified)) // sum up []y jointy := make([]pointE2, s.size) for i := 0; i < s.size; i++ { - C.E2_sum_vector((*C.E2)(&jointy[i]), + C.E2_sum_vector_to_affine((*C.E2)(&jointy[i]), (*C.E2)(&qualifiedy[i][0]), (C.int)(qualified)) } return &jointx, &jointPublicKey, jointy From afa9f240949a1180e49447a1ce06bb458e44bc00 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 5 Sep 2023 19:45:17 -0600 Subject: [PATCH 162/200] clarify some TODOs --- crypto/README.md | 3 --- crypto/bls.go | 3 --- crypto/bls12381_utils.c | 36 ++++++++++++++++++++---------------- crypto/dkg_core.c | 10 ++++++---- crypto/dkg_feldmanvss.go | 2 +- 5 files changed, 27 insertions(+), 27 deletions(-) diff --git a/crypto/README.md b/crypto/README.md index 97156fa52c9..c15d0a36462 100644 --- a/crypto/README.md +++ b/crypto/README.md @@ -62,9 +62,6 @@ All signature schemes use the generic interfaces of `PrivateKey` and `PublicKey` public keys, using a binary tree of aggregations. * SPoCK scheme based on BLS: verifies two signatures have been generated from the same message that is unknown to the verifier. - * Future features: - * support minimal-pubkey-size variant - ### PRNG * ChaCha20-based CSPRNG diff --git a/crypto/bls.go b/crypto/bls.go index b5ed13bd83d..c6f01a6ab28 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -23,9 +23,6 @@ package crypto // - SPoCK scheme based on BLS: verifies two signatures are generated from the same message, // even though the message is unknown to the verifier. 
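// (Aside: one standard way to realize such a check, sketched here rather than
// quoting this package's exact routine: with signatures in G1 and public keys
// in G2, two signatures over the same message m satisfy
//   e(sig_A, pk_B) == e(sig_B, pk_A),
// since both sides equal e(H(m), g2)^(sk_A * sk_B) when sig_X = sk_X * H(m);
// the verifier needs neither m nor H(m).)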
-// future features: -// - implement a G1/G2 swap (minimal-pubkey-size variant) - // #include "bls_include.h" import "C" diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index e040d018024..14d98869847 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -140,16 +140,15 @@ ERROR Fr_read_bytes(Fr *a, const byte *bin, int len) { if (len != Fr_BYTES) { return BAD_ENCODING; } + // compare to r using the BLST tool pow256 tmp; - // compare to r using the provided tool from BLST - pow256_from_be_bytes(tmp, bin); // TODO: check endianness!! - if (!check_mod_256( - tmp, - BLS12_381_r)) { // check_mod_256 compares pow256 against a vec256! + pow256_from_be_bytes(tmp, bin); + // (check_mod_256 compares pow256 against a vec256!) + if (!check_mod_256(tmp, BLS12_381_r)) { return BAD_VALUE; } vec_zero(tmp, sizeof(tmp)); - limbs_from_be_bytes((limb_t *)a, bin, Fr_BYTES); // TODO: check endianness!! + limbs_from_be_bytes((limb_t *)a, bin, Fr_BYTES); return VALID; } @@ -177,11 +176,16 @@ void Fr_write_bytes(byte *bin, const Fr *a) { be_bytes_from_limbs(bin, (limb_t *)a, Fr_BYTES); } -// maps big-endian bytes into an Fr element using modular reduction -// Input is byte-big-endian, output is Fr (internally vec256) -// TODO: check redc_mont_256(vec256 ret, const vec512 a, const vec256 p, limb_t -// n0); +// maps big-endian bytes of any size into an Fr element using modular reduction. +// Input is byte-big-endian, output is Fr (internally vec256). +// +// Note: could use redc_mont_256(vec256 ret, const vec512 a, const vec256 p, limb_t +// n0) to reduce 512 bits at a time. static void Fr_from_be_bytes(Fr *out, const byte *bytes, size_t n) { + // input can be written in base 2^|R|, with R the Montgomery constant + // N = l_1 + L_2*2^|R| .. + L_n*2^(|R|*(n-1)) + // Therefore N mod p can be expressed using R as: + // N mod p = l_1 + L_2*R .. + L_n*R^(n-1) Fr digit, radix; Fr_set_zero(out); Fr_copy(&radix, (Fr *)BLS12_381_rRR); // R^2 @@ -200,7 +204,7 @@ static void Fr_from_be_bytes(Fr *out, const byte *bytes, size_t n) { limbs_from_be_bytes((limb_t *)&digit, p - n, n); Fr_mul_montg(&digit, &digit, &radix); Fr_add(out, out, &digit); - // at this point : out = l_1*R + L_2*R^2 .. + L_n*R^n + // at this point : out = l_1*R + L_2*R^2 .. + L_n*R^n, // reduce the extra R Fr_from_montg(out, out); // clean up possible sensitive data @@ -463,8 +467,8 @@ bool E1_in_G1(const E1 *p) { // - POINT_NOT_ON_CURVE if deserialized point isn't on E1 // - VALID if deserialization is valid -// TODO: replace with POINTonE1_Deserialize_BE and POINTonE1_Uncompress_Z, -// and update logic with G2 subgroup check? +// Note: could use POINTonE1_Deserialize_BE and POINTonE1_Uncompress_Z, +// but needs to update the logic around G2 subgroup check ERROR E1_read_bytes(E1 *a, const byte *bin, const int len) { // check the length if (len != G1_SER_BYTES) { @@ -717,9 +721,9 @@ const E2 *BLS12_381_minus_g2 = (const E2 *)&BLS12_381_NEG_G2; // - BAD_VALUE if Fp^2 coordinates couldn't deserialize // - POINT_NOT_ON_CURVE if deserialized point isn't on E2 // - VALID if deserialization is valid - -// TODO: replace with POINTonE2_Deserialize_BE and POINTonE2_Uncompress_Z, -// and update logic with G2 subgroup check? +// +// Note: can use with POINTonE2_Deserialize_BE and POINTonE2_Uncompress_Z, +// and update the logic around G2 subgroup check. 
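/* For reference, a sketch of how the three flag bits of the first serialized
 * byte can be read in the zcash-style format referenced above for E1/E2
 * serialization (only the header decoding is shown; the helper name is
 * illustrative): */
static void decode_ser_header(const unsigned char *bin) {
  int compressed = (bin[0] >> 7) & 1; /* compression flag */
  int infinity   = (bin[0] >> 6) & 1; /* point-at-infinity flag */
  int y_flag     = (bin[0] >> 5) & 1; /* "larger y" sign flag, meaningful only
                                         for compressed, finite points */
  (void)compressed; (void)infinity; (void)y_flag;
}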
ERROR E2_read_bytes(E2 *a, const byte *bin, const int len) { // check the length if (len != G2_SER_BYTES) { diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 811f9c84653..e5b3bd5d333 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -65,11 +65,13 @@ void G2_vector_write_bytes(byte *out, const E2 *A, const int len) { } } -// The function imports an array of E2 points from a concatenated array of -// bytes. The bytes array is supposed to be in (len * G2_SER_BYTES) -ERROR E2_vector_read_bytes(E2 *A, const byte *src, const int len) { +// The function imports an array of `n` E2 points from a concatenated array of +// bytes. The bytes array is supposed to be of size (n * G2_SER_BYTES). +// +// If return is `VALID`, output vector is guaranteed to be in E2. +ERROR E2_vector_read_bytes(E2 *A, const byte *src, const int n) { byte *p = (byte *)src; - for (int i = 0; i < len; i++) { + for (int i = 0; i < n; i++) { int read_ret = E2_read_bytes(&A[i], p, G2_SER_BYTES); if (read_ret != VALID) return read_ret; diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index 0de83b43dc2..36f486945e7 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -398,7 +398,7 @@ func (s *feldmanVSSstate) receiveShare(origin index, data []byte) { // receives the public vector from the func (s *feldmanVSSstate) receiveVerifVector(origin index, data []byte) { - // only accept the verification vector from the . + // only accept the verification vector from the dealer. if origin != s.dealerIndex { return } From 76850aabf765b81578c7913dfeac6c6f28d20f83 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 5 Sep 2023 19:54:02 -0600 Subject: [PATCH 163/200] DKG's readVector enforces A to be in G2 --- crypto/dkg_core.c | 19 ++++++++++++++++--- crypto/dkg_feldmanvss.go | 8 ++++---- crypto/dkg_include.h | 2 +- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index e5b3bd5d333..af9aac9a560 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -68,13 +68,26 @@ void G2_vector_write_bytes(byte *out, const E2 *A, const int len) { // The function imports an array of `n` E2 points from a concatenated array of // bytes. The bytes array is supposed to be of size (n * G2_SER_BYTES). // -// If return is `VALID`, output vector is guaranteed to be in E2. -ERROR E2_vector_read_bytes(E2 *A, const byte *src, const int n) { +// If return is `VALID`, output vector is guaranteed to be in G2. +// It returns other errors if at least one input isn't a serialization of a E2 +// point, or an input E2 point isn't in G2. +// returns: +// - BAD_ENCODING if the serialization header bits of at least one input are invalid. +// - BAD_VALUE if Fp^2 coordinates of at least one input couldn't deserialize. +// - POINT_NOT_ON_CURVE if at least one input deserialized point isn't on E2. +// - POINT_NOT_IN_GROUP if at least one E2 point isn't in G2. +// - VALID if deserialization of all points to G2 is valid. +ERROR G2_vector_read_bytes(E2 *A, const byte *src, const int n) { byte *p = (byte *)src; for (int i = 0; i < n; i++) { int read_ret = E2_read_bytes(&A[i], p, G2_SER_BYTES); - if (read_ret != VALID) + if (read_ret != VALID) { return read_ret; + } + if (!E2_in_G2(&A[i])) { + return POINT_NOT_IN_GROUP; + } + p += G2_SER_BYTES; } // TODO: add G2 subgroup check? 
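/* These import helpers sit next to the share-evaluation routines declared in
 * dkg_include.h (Fr_polynomial_image, E2_polynomial_images; see the hunk that
 * follows). In the textbook Feldman-VSS setting, a participant's share is the
 * polynomial value P(i) mod r. A plain-integer Horner sketch of such an
 * evaluation, with a small modulus and names that are illustrative rather than
 * this package's internals: */
#include <stdint.h>

/* Evaluate P(x) = a[0] + a[1]*x + ... + a[deg]*x^deg modulo m by Horner's rule.
 * Assumes m < 2^32 and x < m so the 64-bit intermediates cannot overflow. */
static uint64_t poly_eval_mod(const uint64_t *a, int deg, uint64_t x, uint64_t m) {
  uint64_t y = a[deg] % m;
  for (int i = deg - 1; i >= 0; i--) {
    y = (y * x % m + a[i] % m) % m;
  }
  return y;
}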
diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index 36f486945e7..c89bee98ea1 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -457,10 +457,10 @@ func writeVerifVector(dest []byte, A []pointE2) { ) } -// readVerifVector imports A vector from an array of bytes, -// assuming the slice length matches the vector length +// readVerifVector imports A vector (G2 points) from an array of bytes, +// assuming the slice length matches the vector length. func readVerifVector(A []pointE2, src []byte) error { - read := C.E2_vector_read_bytes( + read := C.G2_vector_read_bytes( (*C.E2)(&A[0]), (*C.uchar)(&src[0]), (C.int)(len(A))) @@ -468,7 +468,7 @@ func readVerifVector(A []pointE2, src []byte) error { return nil } // invalid A vector - return invalidInputsErrorf("the verifcation vector does not serialize valid E2 points: error code %d", read) + return invalidInputsErrorf("the verification vector does not serialize valid G2 points: error code %d", read) } func (s *feldmanVSSstate) verifyShare() bool { diff --git a/crypto/dkg_include.h b/crypto/dkg_include.h index fc377f26b4f..05d46187749 100644 --- a/crypto/dkg_include.h +++ b/crypto/dkg_include.h @@ -9,7 +9,7 @@ void Fr_polynomial_image(Fr *out, E2 *y, const Fr *a, const int deg, const byte x); void E2_polynomial_images(E2 *y, const int len_y, const E2 *A, const int deg); void G2_vector_write_bytes(byte *out, const E2 *A, const int len); -ERROR E2_vector_read_bytes(E2 *A, const byte *src, const int len); +ERROR G2_vector_read_bytes(E2 *A, const byte *src, const int len); bool G2_check_log(const Fr *x, const E2 *y); #endif From 589d8d78c39f5967c8743d51cf2c7f8d585eb5b1 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 5 Sep 2023 19:58:08 -0600 Subject: [PATCH 164/200] format --- crypto/bls12381_utils.c | 17 +++++++++-------- crypto/bls12381_utils.h | 8 +++++--- crypto/dkg_core.c | 13 ++++++++----- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 14d98869847..e4636aad457 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -11,9 +11,9 @@ // make sure flow crypto types are consistent with BLST types void types_sanity(void) { - assert(sizeof(Fp)==sizeof(vec384)); - assert(sizeof(E1)==sizeof(POINTonE1)); - assert(sizeof(E2)==sizeof(POINTonE2)); + assert(sizeof(Fp) == sizeof(vec384)); + assert(sizeof(E1) == sizeof(POINTonE1)); + assert(sizeof(E2) == sizeof(POINTonE2)); } // ------------------- Fr utilities @@ -140,11 +140,11 @@ ERROR Fr_read_bytes(Fr *a, const byte *bin, int len) { if (len != Fr_BYTES) { return BAD_ENCODING; } - // compare to r using the BLST tool + // compare to r using the BLST tool pow256 tmp; pow256_from_be_bytes(tmp, bin); // (check_mod_256 compares pow256 against a vec256!) - if (!check_mod_256(tmp, BLS12_381_r)) { + if (!check_mod_256(tmp, BLS12_381_r)) { return BAD_VALUE; } vec_zero(tmp, sizeof(tmp)); @@ -179,8 +179,8 @@ void Fr_write_bytes(byte *bin, const Fr *a) { // maps big-endian bytes of any size into an Fr element using modular reduction. // Input is byte-big-endian, output is Fr (internally vec256). // -// Note: could use redc_mont_256(vec256 ret, const vec512 a, const vec256 p, limb_t -// n0) to reduce 512 bits at a time. +// Note: could use redc_mont_256(vec256 ret, const vec512 a, const vec256 p, +// limb_t n0) to reduce 512 bits at a time. 
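/* To make the digit-by-digit reduction below concrete, a plain-arithmetic
 * sketch (B stands in for 2^|R|; the real code folds the Montgomery factors R
 * in with Fr_mul_montg instead of tracking B^i directly, then strips the
 * leftover R with Fr_from_montg; modulus and digits here are illustrative): */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint64_t p = 1000003;  /* small stand-in modulus */
  const uint64_t B = 1u << 16; /* stand-in for 2^|R| */
  /* N = d[0] + d[1]*B + d[2]*B^2, least-significant digit first */
  const uint64_t d[3] = {0x1234, 0xabcd, 0x00ff};

  uint64_t out = 0, radix = 1; /* radix tracks B^i mod p */
  for (int i = 0; i < 3; i++) {
    out = (out + (d[i] % p) * radix) % p;
    radix = (radix * B) % p;
  }

  uint64_t N = d[0] + d[1] * B + d[2] * B * B; /* small enough to hold exactly */
  printf("%d\n", out == N % p);                /* prints 1 */
  return 0;
}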
static void Fr_from_be_bytes(Fr *out, const byte *bytes, size_t n) { // input can be written in base 2^|R|, with R the Montgomery constant // N = l_1 + L_2*2^|R| .. + L_n*2^(|R|*(n-1)) @@ -1047,7 +1047,8 @@ void Fp12_multi_pairing(Fp12 *res, const E1 *p, const E2 *q, const int len) { E2_to_affine(&tmp2, q + i); vec_copy(q_aff + n, &tmp2, sizeof(POINTonE2_affine)); n++; - if (n == N_MAX) { // if p_aff and q_aff are filled, batch `N_MAX` miller loops + // if p_aff and q_aff are filled, batch `N_MAX` miller loops + if (n == N_MAX) { if (!init_flag) { miller_loop_n(res_vec, q_aff, p_aff, N_MAX); init_flag = 1; diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 1098144fd7c..b0f96669ed7 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -45,11 +45,13 @@ typedef enum { // Compressed and uncompressed points #define UNCOMPRESSED 0 -#define COMPRESSED (UNCOMPRESSED^1) +#define COMPRESSED (UNCOMPRESSED ^ 1) #define G1_SERIALIZATION (COMPRESSED) #define G2_SERIALIZATION (COMPRESSED) -#define G1_SER_BYTES (G1_SERIALIZATION==UNCOMPRESSED ? G1_BYTES : (G1_BYTES/2)) -#define G2_SER_BYTES (G2_SERIALIZATION==UNCOMPRESSED ? G2_BYTES : (G2_BYTES/2)) +#define G1_SER_BYTES \ + (G1_SERIALIZATION == UNCOMPRESSED ? G1_BYTES : (G1_BYTES / 2)) +#define G2_SER_BYTES \ + (G2_SERIALIZATION == UNCOMPRESSED ? G2_BYTES : (G2_BYTES / 2)) // init-related functions void types_sanity(void); diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index af9aac9a560..f7521aa5ac7 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -67,14 +67,17 @@ void G2_vector_write_bytes(byte *out, const E2 *A, const int len) { // The function imports an array of `n` E2 points from a concatenated array of // bytes. The bytes array is supposed to be of size (n * G2_SER_BYTES). -// +// // If return is `VALID`, output vector is guaranteed to be in G2. -// It returns other errors if at least one input isn't a serialization of a E2 +// It returns other errors if at least one input isn't a serialization of a E2 // point, or an input E2 point isn't in G2. // returns: -// - BAD_ENCODING if the serialization header bits of at least one input are invalid. -// - BAD_VALUE if Fp^2 coordinates of at least one input couldn't deserialize. -// - POINT_NOT_ON_CURVE if at least one input deserialized point isn't on E2. +// - BAD_ENCODING if the serialization header bits of at least one input are +// invalid. +// - BAD_VALUE if Fp^2 coordinates of at least one input couldn't +// deserialize. +// - POINT_NOT_ON_CURVE if at least one input deserialized point isn't on +// E2. // - POINT_NOT_IN_GROUP if at least one E2 point isn't in G2. // - VALID if deserialization of all points to G2 is valid. 
ERROR G2_vector_read_bytes(E2 *A, const byte *src, const int n) { From 14c9e3de1093a7d7e7245d05f18073490f2f17c7 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 6 Sep 2023 11:57:54 -0600 Subject: [PATCH 165/200] clean up c flags and add instruction to readme --- crypto/bls12381_utils.go | 2 +- crypto/bls_thresholdsign.go | 1 - crypto/blst_src/README.md | 1 + crypto/dkg_core.c | 2 -- crypto/dkg_feldmanvss.go | 1 - crypto/dkg_feldmanvssq.go | 1 - 6 files changed, 2 insertions(+), 6 deletions(-) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 21d9e13af05..a3867b31b20 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -4,7 +4,7 @@ package crypto // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros -Wno-unused-variable +// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -Wall -fno-builtin-memcpy -fno-builtin-memset -Wno-unused-function -Wno-unused-macros -Wno-unused-variable // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ // #include "bls12381_utils.h" diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 9451f4fb6dc..83fb6d6949f 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -1,6 +1,5 @@ package crypto -// #cgo CFLAGS: // #include "bls_thresholdsign_include.h" import "C" diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md index f6adff64fea..d283b4dd6c4 100644 --- a/crypto/blst_src/README.md +++ b/crypto/blst_src/README.md @@ -25,6 +25,7 @@ To upgrade the BLST version: - [ ] copy the folder `/build/` into this folder. - [ ] move `./blst_src/build/assembly.S` to `./blst_src/build/blst_assembly.S`. - [ ] copy `/bindings/blst.h` and `/bindings/blst_aux.h` into this folder. +- [ ] check that C flags in `./bls12381_utils.go` still match the C flags in `/bindings/go/blst.go`. - [ ] solve all breaking changes that may occur. - [ ] update the commit version on this `README`. diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index f7521aa5ac7..f5f48db67ae 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -90,10 +90,8 @@ ERROR G2_vector_read_bytes(E2 *A, const byte *src, const int n) { if (!E2_in_G2(&A[i])) { return POINT_NOT_IN_GROUP; } - p += G2_SER_BYTES; } - // TODO: add G2 subgroup check? 
return VALID; } diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index c89bee98ea1..2814e59ee14 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -1,6 +1,5 @@ package crypto -// #cgo CFLAGS: // #include "dkg_include.h" import "C" diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index b8056b990dc..c3aca992ee2 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -1,6 +1,5 @@ package crypto -// #cgo CFLAGS: // #include "dkg_include.h" import "C" From 12e338e8bf2f1110648159e4eef15afede554914 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 6 Sep 2023 13:18:57 -0600 Subject: [PATCH 166/200] c format --- crypto/bls_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crypto/bls_core.c b/crypto/bls_core.c index d221f4c2237..aac7d60ee18 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -391,7 +391,8 @@ static void bls_batch_verify_tree(const node *root, const int len, // indices mixup. // - optimize the verification by verifying an aggregated signature against an // aggregated -// public key, and use a top-down recursive verification to find invalid signatures. +// public key, and use a top-down recursive verification to find invalid +// signatures. void bls_batch_verify(const int sigs_len, byte *results, const E2 *pks_input, const byte *sigs_bytes, const byte *data, const int data_len, const byte *seed) { From 7a268aeea938f0aa2e9f47130595b55b727f9931 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 6 Sep 2023 13:20:16 -0600 Subject: [PATCH 167/200] mod tidy --- go.sum | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/go.sum b/go.sum index 753fb0b8bf2..76d79b1ea73 100644 --- a/go.sum +++ b/go.sum @@ -190,6 +190,7 @@ github.com/btcsuite/btcd v0.20.1-beta/go.mod h1:wVuoA8VJLEcwgqHBwHmzLRazpKxTv13P github.com/btcsuite/btcd v0.21.0-beta/go.mod h1:ZSWyehm27aAuS9bvkATT+Xte3hjHZ+MRgMY/8NJ7K94= github.com/btcsuite/btcd/btcec/v2 v2.2.1 h1:xP60mv8fvp+0khmrN0zTdPC3cNm24rfeE6lh2R/Yv3E= github.com/btcsuite/btcd/btcec/v2 v2.2.1/go.mod h1:9/CSmJxmuvqzX9Wh2fXMWToLOHhPd11lSPuIupwTkI8= +github.com/btcsuite/btcd/chaincfg/chainhash v1.0.1/go.mod h1:7SFka0XMvUgj3hfZtydOrQY2mwhPclbT2snogU7SQQc= github.com/btcsuite/btclog v0.0.0-20170628155309-84c8d2346e9f/go.mod h1:TdznJufoqS23FtqVCzL0ZqgP5MqXbb4fg/WgDys70nA= github.com/btcsuite/btcutil v0.0.0-20190207003914-4c204d697803/go.mod h1:+5NJ2+qvTyV9exUAL/rxXi3DcLg2Ts+ymUAY5y4NvMg= github.com/btcsuite/btcutil v0.0.0-20190425235716-9e5f4b9a998d/go.mod h1:+5NJ2+qvTyV9exUAL/rxXi3DcLg2Ts+ymUAY5y4NvMg= @@ -270,7 +271,9 @@ github.com/davidlazar/go-crypto v0.0.0-20170701192655-dcfb0a7ac018/go.mod h1:rQY github.com/davidlazar/go-crypto v0.0.0-20200604182044-b73af7476f6c h1:pFUpOrbxDR6AkioZ1ySsx5yxlDQZ8stG2b88gTPxgJU= github.com/davidlazar/go-crypto v0.0.0-20200604182044-b73af7476f6c/go.mod h1:6UhI8N9EjYm1c2odKpFpAYeR8dsBeM7PtzQhRgxRr9U= github.com/deckarep/golang-set v0.0.0-20180603214616-504e848d77ea/go.mod h1:93vsz/8Wt4joVM7c2AVqh+YRMiUSc14yDtF28KmMOgQ= +github.com/decred/dcrd/crypto/blake256 v1.0.0/go.mod h1:sQl2p6Y26YV+ZOcSTP6thNdn47hh8kt6rqSlvmrXFAc= github.com/decred/dcrd/crypto/blake256 v1.0.1 h1:7PltbUIQB7u/FfZ39+DGa/ShuMyJ5ilcvdfma9wOH6Y= +github.com/decred/dcrd/dcrec/secp256k1/v4 v4.0.1/go.mod h1:hyedUtir6IdtD/7lIxGeCxkaw7y45JueMRL4DIyJDKs= github.com/decred/dcrd/dcrec/secp256k1/v4 v4.2.0 h1:8UrgZ3GkP4i/CLijOJx79Yu+etlyjdBU4sfcs2WYQMs= github.com/decred/dcrd/dcrec/secp256k1/v4 v4.2.0/go.mod 
h1:v57UDF4pDQJcEfFUCRop3lJL149eHGSe9Jvczhzjo/0= github.com/decred/dcrd/lru v1.0.0/go.mod h1:mxKOwFd7lFjN2GZYsiz/ecgqR6kkYAl+0pz0tEMk218= @@ -1250,20 +1253,9 @@ github.com/onflow/flow-core-contracts/lib/go/templates v1.2.3 h1:X25A1dNajNUtE+K github.com/onflow/flow-core-contracts/lib/go/templates v1.2.3/go.mod h1:dqAUVWwg+NlOhsuBHex7bEWmsUjsiExzhe/+t4xNH6A= github.com/onflow/flow-ft/lib/go/contracts v0.7.0 h1:XEKE6qJUw3luhsYmIOteXP53gtxNxrwTohgxJXCYqBE= github.com/onflow/flow-ft/lib/go/contracts v0.7.0/go.mod h1:kTMFIySzEJJeupk+7EmXs0EJ6CBWY/MV9fv9iYQk+RU= -<<<<<<< HEAD -github.com/onflow/flow-go-sdk v0.40.0 h1:s8uwoyTquN8tjdXpqGmNkXTjf79yUII8JExc5QEl4Xw= -github.com/onflow/flow-go-sdk v0.40.0/go.mod h1:34dxXk9Hp/bQw6Zy6+H44Xo0kQU+aJyQoqdDxq00rJM= -github.com/onflow/flow/protobuf/go/flow v0.3.2-0.20230424214110-4f04b71ea3e1 h1:QxQxCgce0tvAn/ibnEVYcUFRpy9QLxdfLRavKWYptvU= -github.com/onflow/flow/protobuf/go/flow v0.3.2-0.20230424214110-4f04b71ea3e1/go.mod h1:NA2pX2nw8zuaxfKphhKsk00kWLwfd+tv8mS23YXO4Sk= -github.com/onflow/go-bitswap v0.0.0-20221017184039-808c5791a8a8 h1:XcSR/n2aSVO7lOEsKScYALcpHlfowLwicZ9yVbL6bnA= -github.com/onflow/go-bitswap v0.0.0-20221017184039-808c5791a8a8/go.mod h1:73C8FlT4L/Qe4Cf5iXUNL8b2pvu4zs5dJMMJ5V2TjUI= -======= github.com/onflow/flow-go-sdk v0.24.0/go.mod h1:IoptMLPyFXWvyd9yYA6/4EmSeeozl6nJoIv4FaEMg74= github.com/onflow/flow-go-sdk v0.41.10 h1:Cio6GJhtx532TUY+cqrqWglD5sZCXkWeM5QvaRha3p4= github.com/onflow/flow-go-sdk v0.41.10/go.mod h1:0a0LiQFbFt8RW/ptoMUU7YkvW9ArVcbjLE0XS78uz1E= -github.com/onflow/flow-go/crypto v0.21.3/go.mod h1:vI6V4CY3R6c4JKBxdcRiR/AnjBfL8OSD97bJc60cLuQ= -github.com/onflow/flow-go/crypto v0.24.9 h1:0EQp+kSZYJepMIiSypfJVe7tzsPcb6UXOdOtsTCDhBs= -github.com/onflow/flow-go/crypto v0.24.9/go.mod h1:fqCzkIBBMRRkciVrvW21rECKq1oD7Q6u+bCI78lfNX0= github.com/onflow/flow-nft/lib/go/contracts v1.1.0 h1:rhUDeD27jhLwOqQKI/23008CYfnqXErrJvc4EFRP2a0= github.com/onflow/flow-nft/lib/go/contracts v1.1.0/go.mod h1:YsvzYng4htDgRB9sa9jxdwoTuuhjK8WYWXTyLkIigZY= github.com/onflow/flow/protobuf/go/flow v0.2.2/go.mod h1:gQxYqCfkI8lpnKsmIjwtN2mV/N2PIwc1I+RUK4HPIc8= @@ -1271,7 +1263,6 @@ github.com/onflow/flow/protobuf/go/flow v0.3.2-0.20230628215638-83439d22e0ce h1: github.com/onflow/flow/protobuf/go/flow v0.3.2-0.20230628215638-83439d22e0ce/go.mod h1:NA2pX2nw8zuaxfKphhKsk00kWLwfd+tv8mS23YXO4Sk= github.com/onflow/go-bitswap v0.0.0-20230703214630-6d3db958c73d h1:QcOAeEyF3iAUHv21LQ12sdcsr0yFrJGoGLyCAzYYtvI= github.com/onflow/go-bitswap v0.0.0-20230703214630-6d3db958c73d/go.mod h1:GCPpiyRoHncdqPj++zPr9ZOYBX4hpJ0pYZRYqSE8VKk= ->>>>>>> master github.com/onflow/sdks v0.5.0 h1:2HCRibwqDaQ1c9oUApnkZtEAhWiNY2GTpRD5+ftdkN8= github.com/onflow/sdks v0.5.0/go.mod h1:F0dj0EyHC55kknLkeD10js4mo14yTdMotnWMslPirrU= github.com/onflow/wal v0.0.0-20230529184820-bc9f8244608d h1:gAEqYPn3DS83rHIKEpsajnppVD1+zwuYPFyeDVFaQvg= @@ -1519,17 +1510,10 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -<<<<<<< HEAD -github.com/subosito/gotenv v1.4.0 h1:yAzM1+SmVcz5R4tXGsNMu1jUl2aOJXoiWUCEwwnGrvs= -github.com/subosito/gotenv v1.4.0/go.mod h1:mZd6rFysKEcUhUHXJk0C/08wAgyDBFuwEYL7vWWGaGo= -======= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify 
v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/subosito/gotenv v1.4.2 h1:X1TuBLAMDFbaTAChgCBLu3DU3UPyELpnF2jjJ2cz/S8= github.com/subosito/gotenv v1.4.2/go.mod h1:ayKnFf/c6rvx/2iiLrJUk1e6plDbT3edrFNGqEflhK0= -github.com/supranational/blst v0.3.4/go.mod h1:jZJtfjgudtNl4en1tzwPIV3KjUnQUvG3/j+w+fVonLw= -github.com/supranational/blst v0.3.10 h1:CMciDZ/h4pXDDXQASe8ZGTNKUiVNxVVA5hpci2Uuhuk= ->>>>>>> master github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ= github.com/syndtr/goleveldb v1.0.1-0.20190923125748-758128399b1d/go.mod h1:9OrXJhf154huy1nPWmuSrkgjPUtUNhA+Zmy+6AESzuA= github.com/tarm/serial v0.0.0-20180830185346-98f6abe2eb07/go.mod h1:kDXzergiv9cbyO7IOYJZWg1U88JhDg3PB6klq9Hg2pA= @@ -1681,7 +1665,6 @@ golang.org/x/crypto v0.0.0-20190618222545-ea8f1a30c443/go.mod h1:yigFU9vqHzYiE8U golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200115085410-6d4e4cb37c7d/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.0.0-20200117160349-530e935923ad/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200221231518-2aa609cf4a9d/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200311171314-f7b00557c8c4/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200423211502-4bdfaf469ed5/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= @@ -1697,6 +1680,7 @@ golang.org/x/crypto v0.0.0-20210513164829-c07d793c2f9a/go.mod h1:P+XmwS30IXTQdn5 golang.org/x/crypto v0.0.0-20210817164053-32db794688a5/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20211108221036-ceb1ce70b4fa/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM= golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -1801,6 +1785,7 @@ golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96b golang.org/x/net v0.0.0-20210423184538-5f58ad60dda6/go.mod h1:OJAsFXCWl8Ukc7SiCT/9KSuxbyM7479/AVlXFRxuMCk= golang.org/x/net v0.0.0-20210503060351-7fd8e65b6420/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20210805182204-aaa1db679c0d/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220325170049-de3da57026de/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= @@ -1888,7 +1873,6 @@ golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20191220142924-d4481acd189f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys 
v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200106162015-b016eb3dc98e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200107162124-548cf772de50/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200113162924-86b910548bc1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= From 7640d9420a540907f733f8914e1cca7646c4a245 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 6 Sep 2023 13:28:12 -0600 Subject: [PATCH 168/200] remove deprecated Seed use --- crypto/bls12381_utils_test.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index a9efd543ed1..ade31bbb6b9 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -3,7 +3,6 @@ package crypto import ( "crypto/rand" "encoding/hex" - mrand "math/rand" "testing" "github.com/stretchr/testify/assert" @@ -166,7 +165,7 @@ func TestSubgroupCheck(t *testing.T) { // subgroup membership check bench func BenchmarkSubgroupCheck(b *testing.B) { seed := make([]byte, g2BytesLen) - _, err := mrand.Read(seed) + _, err := rand.Read(seed) require.NoError(b, err) b.Run("G1", func(b *testing.B) { From 5c72468678d5682dd76209ccb33692a97f052dc3 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 6 Sep 2023 15:16:32 -0600 Subject: [PATCH 169/200] add temp tmate --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b24de2f44ca..a1dec93631f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -205,6 +205,8 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true + - name: Setup tmate session + uses: mxschmitt/action-tmate@v3 - name: Docker build run: make docker-build-flow docker-build-flow-corrupt - name: Run tests From 5011cc6d2888597d0f84385800bbb141ec1161e8 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 6 Sep 2023 23:34:17 -0600 Subject: [PATCH 170/200] fix merging issues and delete relic related builds --- .github/workflows/flaky-test-monitor.yml | 2 -- cmd/bootstrap/README.md | 5 ----- config/README.md | 4 ++-- integration/Makefile | 27 ++++++------------------ integration/benchmark/server/bench.sh | 4 +--- 5 files changed, 9 insertions(+), 33 deletions(-) diff --git a/.github/workflows/flaky-test-monitor.yml b/.github/workflows/flaky-test-monitor.yml index f1c87d03348..c3b662fe070 100644 --- a/.github/workflows/flaky-test-monitor.yml +++ b/.github/workflows/flaky-test-monitor.yml @@ -168,8 +168,6 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Build relic - run: make crypto_setup_gopath - name: Docker build run: make docker-build-flow docker-build-flow-corrupt - name: Run tests diff --git a/cmd/bootstrap/README.md b/cmd/bootstrap/README.md index f9c2b3f2e79..6b138946ca1 100644 --- a/cmd/bootstrap/README.md +++ b/cmd/bootstrap/README.md @@ -97,12 +97,7 @@ Each input is a config file specified as a command line parameter: #### Example ```bash -<<<<<<< HEAD go run ./cmd/bootstrap finalize \ - --fast-kg \ -======= -go run -tags relic ./cmd/bootstrap finalize \ ->>>>>>> master --root-chain main \ --root-height 0 \ --root-parent 
0000000000000000000000000000000000000000000000000000000000000000 \ diff --git a/config/README.md b/config/README.md index f8a31bda478..a7045dd00e1 100644 --- a/config/README.md +++ b/config/README.md @@ -15,12 +15,12 @@ defined. A single default value can be overridden by setting the CLI flag for th config to false. Override entire config file. ```shell -go build -tags relic -o flow-access-node ./cmd/access +go build -o flow-access-node ./cmd/access ./flow-access-node --config-file=config/config.yml ``` Override a single configuration value. ```shell -go build -tags relic -o flow-access-node ./cmd/access +go build -o flow-access-node ./cmd/access ./flow-access-node --network-connection-pruning=false ``` ### Adding a new config value diff --git a/integration/Makefile b/integration/Makefile index 8feb33f72e6..963b7093511 100644 --- a/integration/Makefile +++ b/integration/Makefile @@ -58,16 +58,12 @@ consensus-tests: .PHONY: epochs-cohort1-tests epochs-cohort1-tests: # Use a higher timeout of 20m for the suite of tests which span full epochs -<<<<<<< HEAD - $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -timeout 30m ./tests/epochs/... -======= - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic -timeout 20m ./tests/epochs/cohort1/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -timeout 20m ./tests/epochs/cohort1/... .PHONY: epochs-cohort2-tests epochs-cohort2-tests: # Use a higher timeout of 20m for the suite of tests which span full epochs - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic -timeout 20m ./tests/epochs/cohort2/... ->>>>>>> master + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -timeout 20m ./tests/epochs/cohort2/... .PHONY: ghost-tests ghost-tests: @@ -88,33 +84,22 @@ verification-tests: # upgrades-tests tests need to be run sequentially (-p 1) due to interference between different Docker networks when tests are run in parallel .PHONY: upgrades-tests upgrades-tests: -<<<<<<< HEAD - $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/upgrades/... -======= - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/upgrades/... -p 1 ->>>>>>> master + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/upgrades/... -p 1 .PHONY: network-tests network-tests: $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/network/... # BFT tests need to be run sequentially (-p 1) due to interference between different Docker networks when tests are run in parallel -<<<<<<< HEAD -.PHONY: bft-tests -bft-tests: - $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/bft/... -p 1 - -======= .PHONY: bft-framework-tests bft-framework-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/bft/framework/... 
-p 1 + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/bft/framework/... -p 1 .PHONY: bft-protocol-tests bft-protocol-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/bft/protocol/... -p 1 + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/bft/protocol/... -p 1 .PHONY: bft-gossipsub-tests bft-gossipsub-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/bft/gossipsub/... -p 1 ->>>>>>> master + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/bft/gossipsub/... -p 1 .PHONY: bft-tests bft-tests: bft-framework-tests bft-protocol-tests bft-gossipsub-tests diff --git a/integration/benchmark/server/bench.sh b/integration/benchmark/server/bench.sh index 6ada16119a1..8c87214a3b1 100755 --- a/integration/benchmark/server/bench.sh +++ b/integration/benchmark/server/bench.sh @@ -22,8 +22,6 @@ while read -r branch_hash; do git log --oneline | head -1 git describe - make -C ../.. crypto_setup_gopath - # instead of running "make stop" which uses docker-compose for a lot of older versions, # we explicitly run the command here with "docker compose" DOCKER_BUILDKIT=1 COMPOSE_DOCKER_CLI_BUILD=1 docker compose -f docker-compose.nodes.yml down -v --remove-orphans @@ -36,7 +34,7 @@ while read -r branch_hash; do # sleep is workaround for slow initialization of some node types, so that benchmark does not quit immediately with "connection refused" sleep 30; - go run -tags relic ../benchmark/cmd/ci -log-level debug -git-repo-path ../../ -tps-initial 800 -tps-min 1 -tps-max 1200 -duration 30m + go run ../benchmark/cmd/ci -log-level debug -git-repo-path ../../ -tps-initial 800 -tps-min 1 -tps-max 1200 -duration 30m # instead of running "make stop" which uses docker-compose for a lot of older versions, # we explicitly run the command here with "docker compose" From c52da9c2d30ee14701894e53cdc6b401b6b50718 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 6 Sep 2023 23:34:32 -0600 Subject: [PATCH 171/200] Revert "add temp tmate" This reverts commit 5c72468678d5682dd76209ccb33692a97f052dc3. 
--- .github/workflows/ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a1dec93631f..b24de2f44ca 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -205,8 +205,6 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Setup tmate session - uses: mxschmitt/action-tmate@v3 - name: Docker build run: make docker-build-flow docker-build-flow-corrupt - name: Run tests From e5c0630086fd083bd315031b229ff870ce0c6ef9 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 7 Sep 2023 00:18:25 -0600 Subject: [PATCH 172/200] update flakey test monitor --- .github/workflows/flaky-test-monitor.yml | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/.github/workflows/flaky-test-monitor.yml b/.github/workflows/flaky-test-monitor.yml index c3b662fe070..aa9d99dd65b 100644 --- a/.github/workflows/flaky-test-monitor.yml +++ b/.github/workflows/flaky-test-monitor.yml @@ -83,18 +83,15 @@ jobs: matrix: include: - name: crypto - make1: -C crypto setup - make2: unittest + setup: race: 1 test_category: unit-crypto - name: insecure - make1: install-tools - make2: test + setup: install-tools race: 0 test_category: unit-insecure - name: integration - make1: install-tools - make2: test + setup: install-tools race: 0 test_category: unit-integration runs-on: ubuntu-latest @@ -107,11 +104,11 @@ jobs: go-version: ${{ env.GO_VERSION }} cache: true - name: Setup tests (${{ matrix.name }}) - run: make ${{ matrix.make1 }} + run: make ${{ matrix.setup }} - name: Run tests (${{ matrix.name }}) env: RACE_DETECTOR: ${{ matrix.race }} - run: make -es -C ${{ matrix.name }} ${{ matrix.make2 }} > test-output + run: make -es -C ${{ matrix.name }} test > test-output timeout-minutes: 100 continue-on-error: true - name: Process test results (${{ matrix.name }}) From d1776c83e9212861a77ad1e9ca462a0029f86bc3 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 7 Sep 2023 13:18:18 -0600 Subject: [PATCH 173/200] more clarifications to BLST version update README --- crypto/blst_src/README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md index d283b4dd6c4..d8d8be5313a 100644 --- a/crypto/blst_src/README.md +++ b/crypto/blst_src/README.md @@ -17,16 +17,16 @@ The folder contains: - this `README` file. To upgrade the BLST version: -- [ ] delete all files in this folder (`./blst_src`) but `blst_src.c` and `README.md`. +- [ ] delete all files in this folder `./blst_src/` but `blst_src.c` and `README.md`. - [ ] open BLST repository on the new version. -- [ ] copy all `.c` and `.h` files from `/src/` into this folder. -- [ ] delete `server.c` from this folder. -- [ ] update `blst_src.c` if needed. -- [ ] copy the folder `/build/` into this folder. -- [ ] move `./blst_src/build/assembly.S` to `./blst_src/build/blst_assembly.S`. -- [ ] copy `/bindings/blst.h` and `/bindings/blst_aux.h` into this folder. -- [ ] check that C flags in `./bls12381_utils.go` still match the C flags in `/bindings/go/blst.go`. +- [ ] copy all `.c` and `.h` files from `/src/` into `./blst_src/`. +- [ ] delete `./blst_src/server.c`. +- [ ] copy the folder `/build/` into this folder `./blst_src`. +- [ ] move `./blst_src/build/assembly.S` to `./blst_assembly.S`. +- [ ] copy `/bindings/blst.h` and `/bindings/blst_aux.h` into `./blst_src`. +- [ ] update `./blst_src/blst_src.c` if needed. 
+- [ ] check that C flags in `./bls12381_utils.go` still include the C flags in `/bindings/go/blst.go`. - [ ] solve all breaking changes that may occur. -- [ ] update the commit version on this `README`. +- [ ] update the commit version on this `./blst_src/README`. Remember that Flow crypto is using non exported internal functions from BLST. Checking for interfaces breaking changes in BLST should made along with auditing changes between the old and new versions. This includes checking logical changes and assumptions beyond interfaces, and assessing their security and performance impact on protocols implemented in Flow crypto. \ No newline at end of file From ecf702108f441d76aacc88cd5bf52ca67bdff0fc Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 7 Sep 2023 13:51:42 -0600 Subject: [PATCH 174/200] update BLST source to v0.3.11 --- crypto/blst_assembly.S | 49 ++- crypto/blst_src/LICENSE | 201 ------------ crypto/blst_src/README.md | 9 +- crypto/blst_src/aggregate.c | 2 +- crypto/blst_src/blst_src.c | 3 +- crypto/blst_src/build/bindings_trim.pl | 29 +- .../blst_src/build/coff/add_mod_256-x86_64.s | 105 ++++--- .../blst_src/build/coff/add_mod_384-x86_64.s | 187 ++++++----- .../build/coff/add_mod_384x384-x86_64.s | 28 +- .../build/coff/ct_inverse_mod_256-armv8.S | 5 +- .../build/coff/ct_inverse_mod_256-x86_64.s | 14 +- .../build/coff/ct_inverse_mod_384-armv8.S | 5 +- .../build/coff/ct_is_square_mod_384-armv8.S | 1 + .../build/coff/ct_is_square_mod_384-x86_64.s | 10 +- .../build/coff/ctq_inverse_mod_384-x86_64.s | 15 +- .../build/coff/ctx_inverse_mod_384-x86_64.s | 21 +- crypto/blst_src/build/coff/div3w-armv8.S | 2 +- crypto/blst_src/build/coff/div3w-x86_64.s | 126 +++++++- .../build/coff/mulq_mont_256-x86_64.s | 57 +++- .../build/coff/mulq_mont_384-x86_64.s | 251 ++++++++++----- .../build/coff/mulx_mont_256-x86_64.s | 44 ++- .../build/coff/mulx_mont_384-x86_64.s | 207 ++++++++----- crypto/blst_src/build/coff/sha256-armv8.S | 8 +- .../build/coff/sha256-portable-x86_64.s | 54 ++-- crypto/blst_src/build/coff/sha256-x86_64.s | 182 +++++------ .../build/elf/ct_inverse_mod_256-armv8.S | 5 +- .../build/elf/ct_inverse_mod_256-x86_64.s | 1 + .../build/elf/ct_inverse_mod_384-armv8.S | 5 +- .../build/elf/ct_is_square_mod_384-armv8.S | 1 + .../build/elf/ct_is_square_mod_384-x86_64.s | 1 + .../build/elf/ctq_inverse_mod_384-x86_64.s | 6 + .../build/elf/ctx_inverse_mod_384-x86_64.s | 14 +- crypto/blst_src/build/elf/div3w-armv8.S | 2 +- crypto/blst_src/build/elf/div3w-x86_64.s | 15 +- .../blst_src/build/elf/mulq_mont_256-x86_64.s | 17 + .../blst_src/build/elf/mulq_mont_384-x86_64.s | 119 +++++-- .../blst_src/build/elf/mulx_mont_256-x86_64.s | 4 + .../blst_src/build/elf/mulx_mont_384-x86_64.s | 69 +++-- crypto/blst_src/build/elf/sha256-armv8.S | 8 +- .../build/elf/sha256-portable-x86_64.s | 36 ++- crypto/blst_src/build/elf/sha256-x86_64.s | 67 ++-- .../build/mach-o/ct_inverse_mod_256-armv8.S | 5 +- .../build/mach-o/ct_inverse_mod_256-x86_64.s | 1 + .../build/mach-o/ct_inverse_mod_384-armv8.S | 5 +- .../build/mach-o/ct_is_square_mod_384-armv8.S | 1 + .../mach-o/ct_is_square_mod_384-x86_64.s | 1 + .../build/mach-o/ctq_inverse_mod_384-x86_64.s | 6 + .../build/mach-o/ctx_inverse_mod_384-x86_64.s | 10 +- crypto/blst_src/build/mach-o/div3w-armv8.S | 2 +- crypto/blst_src/build/mach-o/div3w-x86_64.s | 15 +- .../build/mach-o/mulq_mont_256-x86_64.s | 17 + .../build/mach-o/mulq_mont_384-x86_64.s | 103 ++++-- .../build/mach-o/mulx_mont_256-x86_64.s | 4 + .../build/mach-o/mulx_mont_384-x86_64.s | 53 ++-- 
crypto/blst_src/build/mach-o/sha256-armv8.S | 8 +- .../build/mach-o/sha256-portable-x86_64.s | 36 ++- crypto/blst_src/build/mach-o/sha256-x86_64.s | 67 ++-- crypto/blst_src/build/refresh.sh | 48 +++ .../build/win64/add_mod_256-x86_64.asm | 115 +++---- .../build/win64/add_mod_384-x86_64.asm | 215 +++++++------ .../build/win64/add_mod_384x384-x86_64.asm | 32 +- crypto/blst_src/build/win64/blst.def | 4 + .../build/win64/ct_inverse_mod_256-armv8.asm | 5 +- .../build/win64/ct_inverse_mod_256-x86_64.asm | 21 +- .../build/win64/ct_inverse_mod_384-armv8.asm | 5 +- .../win64/ct_is_square_mod_384-armv8.asm | 1 + .../win64/ct_is_square_mod_384-x86_64.asm | 15 +- .../win64/ctq_inverse_mod_384-x86_64.asm | 24 +- .../win64/ctx_inverse_mod_384-x86_64.asm | 30 +- crypto/blst_src/build/win64/div3w-armv8.asm | 2 +- crypto/blst_src/build/win64/div3w-x86_64.asm | 117 ++++++- .../build/win64/mulq_mont_256-x86_64.asm | 69 +++-- .../build/win64/mulq_mont_384-x86_64.asm | 292 ++++++++++++------ .../build/win64/mulx_mont_256-x86_64.asm | 54 ++-- .../build/win64/mulx_mont_384-x86_64.asm | 246 +++++++++------ crypto/blst_src/build/win64/sha256-armv8.asm | 8 +- crypto/blst_src/build/win64/sha256-x86_64.asm | 189 ++++++------ crypto/blst_src/bulk_addition.c | 12 +- crypto/blst_src/bytes.h | 4 +- crypto/blst_src/client_min_pk.c | 17 + crypto/blst_src/client_min_sig.c | 17 + crypto/blst_src/cpuid.c | 85 +++++ crypto/blst_src/e1.c | 8 +- crypto/blst_src/e2.c | 8 +- crypto/blst_src/ec_mult.h | 5 +- crypto/blst_src/exports.c | 120 ++++--- crypto/blst_src/fields.h | 4 +- crypto/blst_src/fp12_tower.c | 4 +- crypto/blst_src/multi_scalar.c | 15 +- crypto/blst_src/pairing.c | 49 +++ crypto/blst_src/pentaroot.c | 4 +- crypto/blst_src/vect.h | 16 +- 92 files changed, 2650 insertions(+), 1529 deletions(-) delete mode 100644 crypto/blst_src/LICENSE create mode 100755 crypto/blst_src/build/refresh.sh create mode 100644 crypto/blst_src/client_min_pk.c create mode 100644 crypto/blst_src/client_min_sig.c create mode 100644 crypto/blst_src/cpuid.c diff --git a/crypto/blst_assembly.S b/crypto/blst_assembly.S index a1a7c5416e0..c0c5db30850 100644 --- a/crypto/blst_assembly.S +++ b/crypto/blst_assembly.S @@ -2,23 +2,22 @@ # if defined(__ELF__) # if defined(__BLST_PORTABLE__) # include "elf/sha256-portable-x86_64.s" -# else -# include "elf/sha256-x86_64.s" +# define blst_sha256_block_data_order blst_sha256_block_ssse3 # endif -# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "elf/sha256-x86_64.s" +# if defined(__ADX__) || defined(__BLST_PORTABLE__) # include "elf/ctx_inverse_mod_384-x86_64.s" -# else +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) # include "elf/ctq_inverse_mod_384-x86_64.s" # endif # include "elf/add_mod_384-x86_64.s" # include "elf/add_mod_384x384-x86_64.s" -# define __add_mod_384 __add_mont_384 -# define __sub_mod_384 __sub_mont_384 -# define __sub_mod_384x384 __sub_mont_384x384 -# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# if defined(__ADX__) || defined(__BLST_PORTABLE__) # include "elf/mulx_mont_384-x86_64.s" # include "elf/mulx_mont_256-x86_64.s" -# else +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) # include "elf/mulq_mont_384-x86_64.s" # include "elf/mulq_mont_256-x86_64.s" # endif @@ -27,25 +26,20 @@ # include "elf/div3w-x86_64.s" # include "elf/ct_is_square_mod_384-x86_64.s" # elif defined(_WIN64) || defined(__CYGWIN__) -# if defined(__BLST_PORTABLE__) -# include "coff/sha256-portable-x86_64.s" -# else -# include "coff/sha256-x86_64.s" -# endif -# if 
defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "coff/sha256-x86_64.s" +# if defined(__ADX__) || defined(__BLST_PORTABLE__) # include "coff/ctx_inverse_mod_384-x86_64.s" -# else +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) # include "coff/ctq_inverse_mod_384-x86_64.s" # endif # include "coff/add_mod_384-x86_64.s" # include "coff/add_mod_384x384-x86_64.s" -# define __add_mod_384 __add_mont_384 -# define __sub_mod_384 __sub_mont_384 -# define __sub_mod_384x384 __sub_mont_384x384 -# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# if defined(__ADX__) || defined(__BLST_PORTABLE__) # include "coff/mulx_mont_384-x86_64.s" # include "coff/mulx_mont_256-x86_64.s" -# else +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) # include "coff/mulq_mont_384-x86_64.s" # include "coff/mulq_mont_256-x86_64.s" # endif @@ -55,20 +49,19 @@ # include "coff/ct_is_square_mod_384-x86_64.s" # elif defined(__APPLE__) # include "mach-o/sha256-x86_64.s" -# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# if defined(__ADX__) || defined(__BLST_PORTABLE__) # include "mach-o/ctx_inverse_mod_384-x86_64.s" -# else +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) # include "mach-o/ctq_inverse_mod_384-x86_64.s" # endif # include "mach-o/add_mod_384-x86_64.s" # include "mach-o/add_mod_384x384-x86_64.s" -# define __add_mod_384 __add_mont_384 -# define __sub_mod_384 __sub_mont_384 -# define __sub_mod_384x384 __sub_mont_384x384 -# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# if defined(__ADX__) || defined(__BLST_PORTABLE__) # include "mach-o/mulx_mont_384-x86_64.s" # include "mach-o/mulx_mont_256-x86_64.s" -# else +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) # include "mach-o/mulq_mont_384-x86_64.s" # include "mach-o/mulq_mont_256-x86_64.s" # endif diff --git a/crypto/blst_src/LICENSE b/crypto/blst_src/LICENSE deleted file mode 100644 index 261eeb9e9f8..00000000000 --- a/crypto/blst_src/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. 
- - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md index d8d8be5313a..ff63254bbe5 100644 --- a/crypto/blst_src/README.md +++ b/crypto/blst_src/README.md @@ -1,5 +1,5 @@ -All files in this folder contain source files copied from the BLST repo https://github.com/supranational/blst -specifically from the commit <92c12ac58095de04e776cec5ef5ce5bdf242b693>. +All files in this folder contain source files copied from the BLST repo https://github.com/supranational/blst, +specifically from the tagged version v0.3.11. Copyright Supranational LLC Licensed under the Apache License, Version 2.0, see LICENSE for details. @@ -10,10 +10,8 @@ While BLST exports multiple functions and tools, the implementation in Flow cryp The folder contains: - BLST LICENSE file - all `/src/*.c` and `/src/*.h` files (C source files) but `server.c`. -- `server.c` is replaced by `blst_src.c` (which lists only the files needed by Flow crypto). 
+- `server.c` is replaced by `./blst_src.c` (which lists only the files needed by Flow crypto). - all `/build` (assembly generated files). -- `/bindings/blst.h` (headers of external functions). -- `/bindings/blst_aux.h` (headers of external aux functions). - this `README` file. To upgrade the BLST version: @@ -23,7 +21,6 @@ To upgrade the BLST version: - [ ] delete `./blst_src/server.c`. - [ ] copy the folder `/build/` into this folder `./blst_src`. - [ ] move `./blst_src/build/assembly.S` to `./blst_assembly.S`. -- [ ] copy `/bindings/blst.h` and `/bindings/blst_aux.h` into `./blst_src`. - [ ] update `./blst_src/blst_src.c` if needed. - [ ] check that C flags in `./bls12381_utils.go` still include the C flags in `/bindings/go/blst.go`. - [ ] solve all breaking changes that may occur. diff --git a/crypto/blst_src/aggregate.c b/crypto/blst_src/aggregate.c index 8a24e0590ba..ca78876acad 100644 --- a/crypto/blst_src/aggregate.c +++ b/crypto/blst_src/aggregate.c @@ -90,7 +90,7 @@ const void *blst_pairing_get_dst(const PAIRING *ctx) /* * Optional |nbits|-wide |scalar| is used to facilitate multiple aggregated - * signature vetification as discussed at + * signature verification as discussed at * https://ethresear.ch/t/fast-verification-of-multiple-bls-signatures/5407. * Usage pattern is not finalized yet, because (sig != NULL) is better and * will be handled separately... diff --git a/crypto/blst_src/blst_src.c b/crypto/blst_src/blst_src.c index a50649e5788..9e064657e72 100644 --- a/crypto/blst_src/blst_src.c +++ b/crypto/blst_src/blst_src.c @@ -11,13 +11,14 @@ #include "map_to_g2.c" #include "fp12_tower.c" #include "pairing.c" -#include "aggregate.c" #include "exp.c" #include "sqrt.c" #include "recip.c" +#include "aggregate.c" #include "bulk_addition.c" #include "multi_scalar.c" #include "consts.c" #include "vect.c" #include "exports.c" + diff --git a/crypto/blst_src/build/bindings_trim.pl b/crypto/blst_src/build/bindings_trim.pl index 90f914578d9..0880352d79e 100755 --- a/crypto/blst_src/build/bindings_trim.pl +++ b/crypto/blst_src/build/bindings_trim.pl @@ -5,6 +5,10 @@ # traverse and remove auto-generated PartialEq for chosen types for (my $i = 0; $i <= $#file; $i++) { + if (@file[$i] =~ m/pub\s+(?:struct|enum)\s+(\w+)/) { + push @structs, $1; + } + if (@file[$i] =~ m/struct\s+blst_p[12]/) { @file[$i-1] =~ s/,\s*PartialEq//; } elsif (@file[$i] =~ m/struct\s+blst_fp12/) { @@ -15,23 +19,22 @@ @file[$i-1] =~ s/,\s*Copy//; @file[$i-1] =~ s/\)/, Zeroize\)/; splice @file, $i, 0, "#[zeroize(drop)]\n"; $i++; - } elsif (@file[$i] =~ m/assert_eq!\($/) { - @file[++$i] =~ s/unsafe\s*\{\s*&\(\*\(::std::ptr::null::<(\w+)>\(\)\)\)\.(\w+).*\}/offsetof!($1, $2)/; + } else { + @file[$i] =~ s/::std::/::core::/g; } } +print @file; + print << '___'; -#[cfg(test)] -macro_rules! 
offsetof { - ($type:ty, $field:tt) => { - { - let v = <$type>::default(); - (&v.$field as *const _ as usize) - (&v as *const _ as usize) - } - }; -} +#[test] +fn bindgen_test_normal_types() { + // from "Rust for Rustaceans" by Jon Gjengset + fn is_normal() {} ___ -# print the file -print @file; +for (@structs) { + print " is_normal::<$_>();\n"; +} +print "}\n"; close STDOUT; diff --git a/crypto/blst_src/build/coff/add_mod_256-x86_64.s b/crypto/blst_src/build/coff/add_mod_256-x86_64.s index f88e6189ca5..c2c83502a18 100644 --- a/crypto/blst_src/build/coff/add_mod_256-x86_64.s +++ b/crypto/blst_src/build/coff/add_mod_256-x86_64.s @@ -10,14 +10,14 @@ add_mod_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_add_mod_256: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx subq $8,%rsp @@ -81,13 +81,13 @@ mul_by_3_mod_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_by_3_mod_256: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx pushq %rbx pushq %r12 @@ -161,14 +161,14 @@ lshift_mod_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_lshift_mod_256: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx pushq %r12 @@ -218,14 +218,14 @@ rshift_mod_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_rshift_mod_256: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx subq $8,%rsp @@ -307,14 +307,14 @@ cneg_mod_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_cneg_mod_256: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx pushq %r12 @@ -385,14 +385,14 @@ sub_mod_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sub_mod_256: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx subq $8,%rsp @@ -454,10 +454,10 @@ check_mod_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_check_mod_256: - movq %rcx,%rdi - movq %rdx,%rsi + movq %rcx,%rdi + movq %rdx,%rsi movq 0(%rdi),%rax movq 8(%rdi),%r9 movq 16(%rdi),%r10 @@ -497,14 +497,14 @@ add_n_check_mod_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_add_n_check_mod_256: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx subq $8,%rsp @@ -573,14 +573,14 @@ sub_n_check_mod_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sub_n_check_mod_256: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx subq $8,%rsp @@ -744,8 +744,9 @@ sub_n_check_mod_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_add_mod_256_body: .byte 1,0,9,0 .byte 0x00,0x34,0x01,0x00 @@ -753,7 +754,8 @@ sub_n_check_mod_256: .byte 0x00,0x74,0x04,0x00 .byte 0x00,0x64,0x05,0x00 .byte 0x00,0x22 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_add_mod_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -764,8 +766,9 @@ sub_n_check_mod_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 
.byte 0,0 +.long 0,0 .LSEH_info_mul_by_3_mod_256_body: .byte 1,0,11,0 .byte 0x00,0xc4,0x00,0x00 @@ -785,8 +788,9 @@ sub_n_check_mod_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_lshift_mod_256_body: .byte 1,0,11,0 .byte 0x00,0xc4,0x00,0x00 @@ -806,8 +810,9 @@ sub_n_check_mod_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_rshift_mod_256_body: .byte 1,0,9,0 .byte 0x00,0x34,0x01,0x00 @@ -815,7 +820,8 @@ sub_n_check_mod_256: .byte 0x00,0x74,0x04,0x00 .byte 0x00,0x64,0x05,0x00 .byte 0x00,0x22 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_rshift_mod_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -826,8 +832,9 @@ sub_n_check_mod_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_cneg_mod_256_body: .byte 1,0,11,0 .byte 0x00,0xc4,0x00,0x00 @@ -847,8 +854,9 @@ sub_n_check_mod_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sub_mod_256_body: .byte 1,0,9,0 .byte 0x00,0x34,0x01,0x00 @@ -856,7 +864,8 @@ sub_n_check_mod_256: .byte 0x00,0x74,0x04,0x00 .byte 0x00,0x64,0x05,0x00 .byte 0x00,0x22 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sub_mod_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -873,8 +882,9 @@ sub_n_check_mod_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_add_n_check_mod_256_body: .byte 1,0,9,0 .byte 0x00,0x34,0x01,0x00 @@ -882,7 +892,8 @@ sub_n_check_mod_256: .byte 0x00,0x74,0x04,0x00 .byte 0x00,0x64,0x05,0x00 .byte 0x00,0x22 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_add_n_check_mod_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -893,8 +904,9 @@ sub_n_check_mod_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sub_n_check_mod_256_body: .byte 1,0,9,0 .byte 0x00,0x34,0x01,0x00 @@ -902,7 +914,8 @@ sub_n_check_mod_256: .byte 0x00,0x74,0x04,0x00 .byte 0x00,0x64,0x05,0x00 .byte 0x00,0x22 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sub_n_check_mod_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 diff --git a/crypto/blst_src/build/coff/add_mod_384-x86_64.s b/crypto/blst_src/build/coff/add_mod_384-x86_64.s index d1c7ad6e689..3ef562a3bf2 100644 --- a/crypto/blst_src/build/coff/add_mod_384-x86_64.s +++ b/crypto/blst_src/build/coff/add_mod_384-x86_64.s @@ -10,14 +10,14 @@ add_mod_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_add_mod_384: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx pushq %r12 @@ -118,14 +118,14 @@ add_mod_384x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_add_mod_384x: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx pushq %r12 @@ -186,14 +186,14 @@ rshift_mod_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_rshift_mod_384: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx pushq %r12 @@ -315,13 +315,13 @@ div_by_2_mod_384: movq %rsi,16(%rsp) 
movq %rsp,%r11 .LSEH_begin_div_by_2_mod_384: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx pushq %rbx pushq %r12 @@ -387,14 +387,14 @@ lshift_mod_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_lshift_mod_384: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx pushq %r12 @@ -528,13 +528,13 @@ mul_by_3_mod_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_by_3_mod_384: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx pushq %rbx pushq %r12 @@ -595,13 +595,13 @@ mul_by_8_mod_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_by_8_mod_384: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx pushq %rbx pushq %r12 @@ -669,13 +669,13 @@ mul_by_3_mod_384x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_by_3_mod_384x: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx pushq %rbx pushq %r12 @@ -752,13 +752,13 @@ mul_by_8_mod_384x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_by_8_mod_384x: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx pushq %rbx pushq %r12 @@ -845,14 +845,14 @@ cneg_mod_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_cneg_mod_384: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx pushq %r12 @@ -952,14 +952,14 @@ sub_mod_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sub_mod_384: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx pushq %r12 @@ -1058,14 +1058,14 @@ sub_mod_384x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sub_mod_384x: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx pushq %r12 @@ -1124,13 +1124,13 @@ mul_by_1_plus_i_mod_384x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_by_1_plus_i_mod_384x: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx pushq %rbx pushq %r12 @@ -1274,10 +1274,10 @@ sgn0_pty_mod_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sgn0_pty_mod_384: - movq %rcx,%rdi - movq %rdx,%rsi + movq %rcx,%rdi + movq %rdx,%rsi .LSEH_body_sgn0_pty_mod_384: movq 0(%rdi),%r8 @@ -1328,12 +1328,12 @@ sgn0_pty_mod_384x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sgn0_pty_mod_384x: - movq %rcx,%rdi - movq %rdx,%rsi pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi pushq %rbx subq $8,%rsp @@ -2134,8 +2134,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_add_mod_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -2147,7 +2148,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_add_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2158,8 +2160,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_add_mod_384x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x03,0x00 @@ 
-2171,7 +2174,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x0a,0x00 .byte 0x00,0x64,0x0b,0x00 .byte 0x00,0x82 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_add_mod_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2182,8 +2186,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_rshift_mod_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -2195,7 +2200,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_rshift_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2206,8 +2212,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_div_by_2_mod_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -2219,7 +2226,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_div_by_2_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2230,8 +2238,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_lshift_mod_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -2243,7 +2252,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_lshift_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2254,8 +2264,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mul_by_3_mod_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -2267,7 +2278,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_by_3_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2278,8 +2290,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mul_by_8_mod_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -2291,7 +2304,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_by_8_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2302,8 +2316,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mul_by_3_mod_384x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -2315,7 +2330,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_by_3_mod_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2326,8 +2342,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mul_by_8_mod_384x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -2339,7 +2356,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 
.LSEH_info_mul_by_8_mod_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2350,8 +2368,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_cneg_mod_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -2363,7 +2382,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_cneg_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2374,8 +2394,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sub_mod_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -2387,7 +2408,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sub_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2398,8 +2420,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sub_mod_384x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x03,0x00 @@ -2411,7 +2434,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x0a,0x00 .byte 0x00,0x64,0x0b,0x00 .byte 0x00,0x82 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sub_mod_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2422,8 +2446,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mul_by_1_plus_i_mod_384x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x07,0x00 @@ -2435,7 +2460,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x0e,0x00 .byte 0x00,0x64,0x0f,0x00 .byte 0x00,0xc2 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_by_1_plus_i_mod_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2446,8 +2472,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sgn0_pty_mod_384_body: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2463,8 +2490,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sgn0_pty_mod_384x_body: .byte 1,0,9,0 .byte 0x00,0x34,0x01,0x00 @@ -2472,7 +2500,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x04,0x00 .byte 0x00,0x64,0x05,0x00 .byte 0x00,0x22 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sgn0_pty_mod_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 diff --git a/crypto/blst_src/build/coff/add_mod_384x384-x86_64.s b/crypto/blst_src/build/coff/add_mod_384x384-x86_64.s index 79976cc0e7a..53662b4a56a 100644 --- a/crypto/blst_src/build/coff/add_mod_384x384-x86_64.s +++ b/crypto/blst_src/build/coff/add_mod_384x384-x86_64.s @@ -145,14 +145,14 @@ add_mod_384x384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_add_mod_384x384: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx pushq %r12 @@ -202,14 +202,14 @@ sub_mod_384x384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sub_mod_384x384: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx pushq %r12 @@ -280,8 +280,9 @@ sub_mod_384x384: .byte 1,0,5,0x0b 
.byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_add_mod_384x384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -293,7 +294,8 @@ sub_mod_384x384: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_add_mod_384x384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -304,8 +306,9 @@ sub_mod_384x384: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sub_mod_384x384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -317,7 +320,8 @@ sub_mod_384x384: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sub_mod_384x384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 diff --git a/crypto/blst_src/build/coff/ct_inverse_mod_256-armv8.S b/crypto/blst_src/build/coff/ct_inverse_mod_256-armv8.S index 17c3d25278f..d2fd83182b4 100644 --- a/crypto/blst_src/build/coff/ct_inverse_mod_256-armv8.S +++ b/crypto/blst_src/build/coff/ct_inverse_mod_256-armv8.S @@ -1,6 +1,7 @@ .text .globl ct_inverse_mod_256 + .def ct_inverse_mod_256; .type 32; .endef @@ -62,14 +63,14 @@ ct_inverse_mod_256: madd x4, x16, x8, xzr // |u|*|f0| madd x4, x17, x9, x4 // |v|*|g0| str x4, [x0,#8*4] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*5] stp x5, x5, [x0,#8*7] madd x4, x12, x8, xzr // |u|*|f1| madd x4, x13, x9, x4 // |v|*|g1| str x4, [x0,#8*9] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*10] stp x5, x5, [x0,#8*12] eor x1, x1, #256 // flip-flop src |a|b|u|v| diff --git a/crypto/blst_src/build/coff/ct_inverse_mod_256-x86_64.s b/crypto/blst_src/build/coff/ct_inverse_mod_256-x86_64.s index e7d4a6313b1..d1aa7597bc0 100644 --- a/crypto/blst_src/build/coff/ct_inverse_mod_256-x86_64.s +++ b/crypto/blst_src/build/coff/ct_inverse_mod_256-x86_64.s @@ -1,6 +1,7 @@ .text .globl ct_inverse_mod_256 + .def ct_inverse_mod_256; .scl 2; .type 32; .endef .p2align 5 ct_inverse_mod_256: @@ -9,14 +10,14 @@ ct_inverse_mod_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_ct_inverse_mod_256: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx pushq %r12 @@ -1188,8 +1189,9 @@ __inner_loop_62_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_ct_inverse_mod_256_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x86,0x00 @@ -1201,6 +1203,8 @@ __inner_loop_62_256: .byte 0x00,0x74,0x8d,0x00 .byte 0x00,0x64,0x8e,0x00 .byte 0x00,0x01,0x8c,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_ct_inverse_mod_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 diff --git a/crypto/blst_src/build/coff/ct_inverse_mod_384-armv8.S b/crypto/blst_src/build/coff/ct_inverse_mod_384-armv8.S index 65193f1e96a..86fdc405828 100644 --- a/crypto/blst_src/build/coff/ct_inverse_mod_384-armv8.S +++ b/crypto/blst_src/build/coff/ct_inverse_mod_384-armv8.S @@ -1,6 +1,7 @@ .text .globl ct_inverse_mod_383 + .def ct_inverse_mod_383; .type 32; .endef @@ -73,7 +74,7 @@ ct_inverse_mod_383: adds x3, x3, x5 adc x4, x4, x6 stp x3, x4, [x0,#8*6] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*8] stp x5, x5, [x0,#8*10] @@ -84,7 +85,7 @@ ct_inverse_mod_383: adds x3, x3, x5 
adc x4, x4, x6 stp x3, x4, [x0,#8*12] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*14] stp x5, x5, [x0,#8*16] eor x1, x1, #256 // flip-flop src |a|b|u|v| diff --git a/crypto/blst_src/build/coff/ct_is_square_mod_384-armv8.S b/crypto/blst_src/build/coff/ct_is_square_mod_384-armv8.S index 34336ff486b..efe90a82144 100644 --- a/crypto/blst_src/build/coff/ct_is_square_mod_384-armv8.S +++ b/crypto/blst_src/build/coff/ct_is_square_mod_384-armv8.S @@ -1,6 +1,7 @@ .text .globl ct_is_square_mod_384 + .def ct_is_square_mod_384; .type 32; .endef diff --git a/crypto/blst_src/build/coff/ct_is_square_mod_384-x86_64.s b/crypto/blst_src/build/coff/ct_is_square_mod_384-x86_64.s index ee4790321e6..9ac32f50852 100644 --- a/crypto/blst_src/build/coff/ct_is_square_mod_384-x86_64.s +++ b/crypto/blst_src/build/coff/ct_is_square_mod_384-x86_64.s @@ -1,6 +1,7 @@ .text .globl ct_is_square_mod_384 + .def ct_is_square_mod_384; .scl 2; .type 32; .endef .p2align 5 ct_is_square_mod_384: @@ -9,12 +10,12 @@ ct_is_square_mod_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_ct_is_square_mod_384: - movq %rcx,%rdi - movq %rdx,%rsi pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi pushq %rbx pushq %r12 @@ -484,8 +485,9 @@ __inner_loop_48: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_ct_is_square_mod_384_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x43,0x00 @@ -497,6 +499,8 @@ __inner_loop_48: .byte 0x00,0x74,0x4a,0x00 .byte 0x00,0x64,0x4b,0x00 .byte 0x00,0x01,0x49,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_ct_is_square_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 diff --git a/crypto/blst_src/build/coff/ctq_inverse_mod_384-x86_64.s b/crypto/blst_src/build/coff/ctq_inverse_mod_384-x86_64.s index 42f058a3c8d..d027a6dc5c0 100644 --- a/crypto/blst_src/build/coff/ctq_inverse_mod_384-x86_64.s +++ b/crypto/blst_src/build/coff/ctq_inverse_mod_384-x86_64.s @@ -1,6 +1,8 @@ +.comm __blst_platform_cap,4 .text .globl ct_inverse_mod_383 + .def ct_inverse_mod_383; .scl 2; .type 32; .endef .p2align 5 ct_inverse_mod_383: @@ -9,12 +11,16 @@ ct_inverse_mod_383: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_ct_inverse_mod_383: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz ct_inverse_mod_383$1 +#endif pushq %rbp pushq %rbx @@ -1200,8 +1206,9 @@ __inner_loop_62: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_ct_inverse_mod_383_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x8b,0x00 @@ -1213,6 +1220,8 @@ __inner_loop_62: .byte 0x00,0x74,0x92,0x00 .byte 0x00,0x64,0x93,0x00 .byte 0x00,0x01,0x91,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_ct_inverse_mod_383_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 diff --git a/crypto/blst_src/build/coff/ctx_inverse_mod_384-x86_64.s b/crypto/blst_src/build/coff/ctx_inverse_mod_384-x86_64.s index 7c13e56eb2a..4f7dd6d1552 100644 --- a/crypto/blst_src/build/coff/ctx_inverse_mod_384-x86_64.s +++ b/crypto/blst_src/build/coff/ctx_inverse_mod_384-x86_64.s @@ -1,6 +1,7 @@ .text .globl ctx_inverse_mod_383 + .def ctx_inverse_mod_383; .scl 2; .type 32; .endef .p2align 5 ctx_inverse_mod_383: @@ -9,12 +10,13 @@ ctx_inverse_mod_383: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_ctx_inverse_mod_383: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +ct_inverse_mod_383$1: pushq %rbp pushq %rbx @@ -812,7 
+814,7 @@ ctx_inverse_mod_383: movq 48(%rsi),%r10 - call __inner_loop_62 + call __tail_loop_53 @@ -1514,9 +1516,9 @@ __inner_loop_31: .byte 0xf3,0xc3 -.def __inner_loop_62; .scl 3; .type 32; .endef +.def __tail_loop_53; .scl 3; .type 32; .endef .p2align 5 -__inner_loop_62: +__tail_loop_53: .byte 0xf3,0x0f,0x1e,0xfa movq $1,%rdx @@ -1524,7 +1526,7 @@ __inner_loop_62: xorq %r12,%r12 movq $1,%r13 -.Loop_62: +.Loop_53: xorq %rax,%rax testq $1,%r8 movq %r10,%rbx @@ -1551,7 +1553,7 @@ __inner_loop_62: subq %rax,%rdx subq %rbx,%rcx subl $1,%edi - jnz .Loop_62 + jnz .Loop_53 .byte 0xf3,0xc3 @@ -1575,8 +1577,9 @@ __inner_loop_62: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_ctx_inverse_mod_383_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x8b,0x00 @@ -1588,6 +1591,8 @@ __inner_loop_62: .byte 0x00,0x74,0x92,0x00 .byte 0x00,0x64,0x93,0x00 .byte 0x00,0x01,0x91,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_ctx_inverse_mod_383_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 diff --git a/crypto/blst_src/build/coff/div3w-armv8.S b/crypto/blst_src/build/coff/div3w-armv8.S index c17b9e38336..2e5d7045d6a 100644 --- a/crypto/blst_src/build/coff/div3w-armv8.S +++ b/crypto/blst_src/build/coff/div3w-armv8.S @@ -27,7 +27,7 @@ div_3_limbs: asr x3,x0,#63 // top bit -> mask add x0,x0,x0 // Q <<= 1 subs x6,x4,x1 // R - D - add x0,x0,#1 // Q + specilative bit + add x0,x0,#1 // Q + speculative bit sbcs x7,x5,x2 sbc x0,x0,xzr // subtract speculative bit diff --git a/crypto/blst_src/build/coff/div3w-x86_64.s b/crypto/blst_src/build/coff/div3w-x86_64.s index fcfe54480be..033d1eb3055 100644 --- a/crypto/blst_src/build/coff/div3w-x86_64.s +++ b/crypto/blst_src/build/coff/div3w-x86_64.s @@ -8,11 +8,14 @@ div_3_limbs: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) - movq %rsp,%rax + movq %rsp,%r11 .LSEH_begin_div_3_limbs: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx +.LSEH_body_div_3_limbs: movq (%rdi),%r8 movq 8(%rdi),%r9 @@ -45,9 +48,12 @@ div_3_limbs: orq %rcx,%rax - movq 8(%rsp),%rdi - movq 16(%rsp),%rsi +.LSEH_epilogue_div_3_limbs: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + .byte 0xf3,0xc3 + .LSEH_end_div_3_limbs: .globl quot_rem_128 @@ -57,11 +63,14 @@ quot_rem_128: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) - movq %rsp,%rax + movq %rsp,%r11 .LSEH_begin_quot_rem_128: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx +.LSEH_body_quot_rem_128: movq %rdx,%rax movq %rdx,%rcx @@ -97,9 +106,12 @@ quot_rem_128: movq %rcx,%rax - movq 8(%rsp),%rdi - movq 16(%rsp),%rsi +.LSEH_epilogue_quot_rem_128: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + .byte 0xf3,0xc3 + .LSEH_end_quot_rem_128: @@ -114,11 +126,14 @@ quot_rem_64: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) - movq %rsp,%rax + movq %rsp,%r11 .LSEH_begin_quot_rem_64: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx +.LSEH_body_quot_rem_64: movq %rdx,%rax imulq 0(%rsi),%rdx @@ -130,11 +145,104 @@ quot_rem_64: movq %r10,0(%rdi) movq %rax,8(%rdi) - movq 8(%rsp),%rdi - movq 16(%rsp),%rsi +.LSEH_epilogue_quot_rem_64: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + .byte 0xf3,0xc3 + .LSEH_end_quot_rem_64: .section .pdata .p2align 2 +.rva .LSEH_begin_div_3_limbs +.rva .LSEH_body_div_3_limbs +.rva .LSEH_info_div_3_limbs_prologue + +.rva .LSEH_body_div_3_limbs +.rva .LSEH_epilogue_div_3_limbs +.rva .LSEH_info_div_3_limbs_body + +.rva .LSEH_epilogue_div_3_limbs +.rva .LSEH_end_div_3_limbs +.rva .LSEH_info_div_3_limbs_epilogue + +.rva .LSEH_begin_quot_rem_128 +.rva 
.LSEH_body_quot_rem_128 +.rva .LSEH_info_quot_rem_128_prologue + +.rva .LSEH_body_quot_rem_128 +.rva .LSEH_epilogue_quot_rem_128 +.rva .LSEH_info_quot_rem_128_body + +.rva .LSEH_epilogue_quot_rem_128 +.rva .LSEH_end_quot_rem_128 +.rva .LSEH_info_quot_rem_128_epilogue + +.rva .LSEH_begin_quot_rem_64 +.rva .LSEH_body_quot_rem_64 +.rva .LSEH_info_quot_rem_64_prologue + +.rva .LSEH_body_quot_rem_64 +.rva .LSEH_epilogue_quot_rem_64 +.rva .LSEH_info_quot_rem_64_body + +.rva .LSEH_epilogue_quot_rem_64 +.rva .LSEH_end_quot_rem_64 +.rva .LSEH_info_quot_rem_64_epilogue + .section .xdata .p2align 3 +.LSEH_info_div_3_limbs_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_div_3_limbs_body: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_div_3_limbs_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_quot_rem_128_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_quot_rem_128_body: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_quot_rem_128_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_quot_rem_64_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_quot_rem_64_body: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_quot_rem_64_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/mulq_mont_256-x86_64.s b/crypto/blst_src/build/coff/mulq_mont_256-x86_64.s index dd1e00fa301..2dd30bc5b5d 100644 --- a/crypto/blst_src/build/coff/mulq_mont_256-x86_64.s +++ b/crypto/blst_src/build/coff/mulq_mont_256-x86_64.s @@ -1,3 +1,4 @@ +.comm __blst_platform_cap,4 .text .globl mul_mont_sparse_256 @@ -10,13 +11,17 @@ mul_mont_sparse_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_mont_sparse_256: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_mont_sparse_256$1 +#endif pushq %rbp pushq %rbx @@ -80,12 +85,16 @@ sqr_mont_sparse_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqr_mont_sparse_256: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_mont_sparse_256$1 +#endif pushq %rbp pushq %rbx @@ -430,12 +439,16 @@ from_mont_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_from_mont_256: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz from_mont_256$1 +#endif pushq %rbp pushq %rbx @@ -510,12 +523,16 @@ redc_mont_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_redc_mont_256: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz redc_mont_256$1 +#endif pushq %rbp pushq %rbx @@ -778,8 +795,9 @@ __mulq_by_1_mont_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mul_mont_sparse_256_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -791,7 +809,8 @@ __mulq_by_1_mont_256: .byte 0x00,0x74,0x08,0x00 .byte 
0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_mont_sparse_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -802,8 +821,9 @@ __mulq_by_1_mont_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqr_mont_sparse_256_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -815,7 +835,8 @@ __mulq_by_1_mont_256: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_mont_sparse_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -826,8 +847,9 @@ __mulq_by_1_mont_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_from_mont_256_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -839,7 +861,8 @@ __mulq_by_1_mont_256: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_from_mont_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -850,8 +873,9 @@ __mulq_by_1_mont_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_redc_mont_256_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -863,7 +887,8 @@ __mulq_by_1_mont_256: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_redc_mont_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 diff --git a/crypto/blst_src/build/coff/mulq_mont_384-x86_64.s b/crypto/blst_src/build/coff/mulq_mont_384-x86_64.s index 5663463524a..ee646f5b137 100644 --- a/crypto/blst_src/build/coff/mulq_mont_384-x86_64.s +++ b/crypto/blst_src/build/coff/mulq_mont_384-x86_64.s @@ -1,3 +1,4 @@ +.comm __blst_platform_cap,4 .text @@ -6,9 +7,9 @@ -.def __sub_mod_384x384; .scl 3; .type 32; .endef +.def __subq_mod_384x384; .scl 3; .type 32; .endef .p2align 5 -__sub_mod_384x384: +__subq_mod_384x384: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 @@ -73,9 +74,9 @@ __sub_mod_384x384: .byte 0xf3,0xc3 -.def __add_mod_384; .scl 3; .type 32; .endef +.def __addq_mod_384; .scl 3; .type 32; .endef .p2align 5 -__add_mod_384: +__addq_mod_384: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 @@ -123,9 +124,9 @@ __add_mod_384: .byte 0xf3,0xc3 -.def __sub_mod_384; .scl 3; .type 32; .endef +.def __subq_mod_384; .scl 3; .type 32; .endef .p2align 5 -__sub_mod_384: +__subq_mod_384: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 @@ -135,7 +136,7 @@ __sub_mod_384: movq 32(%rsi),%r12 movq 40(%rsi),%r13 -__sub_mod_384_a_is_loaded: +__subq_mod_384_a_is_loaded: subq 0(%rdx),%r8 movq 0(%rcx),%r14 sbbq 8(%rdx),%r9 @@ -182,13 +183,17 @@ mul_mont_384x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_mont_384x: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_mont_384x$1 +#endif pushq %rbp pushq %rbx @@ -229,12 +234,12 @@ mul_mont_384x: movq 8(%rsp),%rcx leaq -48(%rsi),%rdx leaq 40+192+48(%rsp),%rdi - call __add_mod_384 + call __addq_mod_384 movq 16(%rsp),%rsi leaq 48(%rsi),%rdx leaq -48(%rdi),%rdi - call __add_mod_384 + call __addq_mod_384 leaq (%rdi),%rbx leaq 48(%rdi),%rsi @@ -244,17 +249,17 @@ mul_mont_384x: leaq (%rdi),%rsi leaq 40(%rsp),%rdx movq 8(%rsp),%rcx - call __sub_mod_384x384 + call __subq_mod_384x384 leaq 
(%rdi),%rsi leaq -96(%rdi),%rdx - call __sub_mod_384x384 + call __subq_mod_384x384 leaq 40(%rsp),%rsi leaq 40+96(%rsp),%rdx leaq 40(%rsp),%rdi - call __sub_mod_384x384 + call __subq_mod_384x384 movq %rcx,%rbx @@ -263,14 +268,14 @@ mul_mont_384x: movq 0(%rsp),%rcx movq 32(%rsp),%rdi call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 leaq 40+192(%rsp),%rsi movq 0(%rsp),%rcx leaq 48(%rdi),%rdi call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 leaq 328(%rsp),%r8 movq 0(%r8),%r15 @@ -304,12 +309,16 @@ sqr_mont_384x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqr_mont_384x: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_mont_384x$1 +#endif pushq %rbp pushq %rbx @@ -335,13 +344,13 @@ sqr_mont_384x: leaq 48(%rsi),%rdx leaq 32(%rsp),%rdi - call __add_mod_384 + call __addq_mod_384 movq 16(%rsp),%rsi leaq 48(%rsi),%rdx leaq 32+48(%rsp),%rdi - call __sub_mod_384 + call __subq_mod_384 movq 16(%rsp),%rsi @@ -433,12 +442,16 @@ mul_382x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_382x: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_382x$1 +#endif pushq %rbp pushq %rbx @@ -528,18 +541,18 @@ mul_382x: leaq 32(%rsp),%rdx movq 24(%rsp),%rcx movq %rsi,%rdi - call __sub_mod_384x384 + call __subq_mod_384x384 leaq 0(%rdi),%rsi leaq -96(%rdi),%rdx - call __sub_mod_384x384 + call __subq_mod_384x384 leaq -96(%rdi),%rsi leaq 32(%rsp),%rdx leaq -96(%rdi),%rdi - call __sub_mod_384x384 + call __subq_mod_384x384 leaq 136(%rsp),%r8 movq 0(%r8),%r15 @@ -573,11 +586,15 @@ sqr_382x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqr_382x: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_382x$1 +#endif pushq %rbp pushq %rbx @@ -628,7 +645,7 @@ sqr_382x: leaq 48(%rsi),%rdx leaq 48(%rdi),%rdi - call __sub_mod_384_a_is_loaded + call __subq_mod_384_a_is_loaded leaq (%rdi),%rsi @@ -710,11 +727,15 @@ mul_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_384$1 +#endif pushq %rbp pushq %rbx @@ -1039,10 +1060,14 @@ sqr_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqr_384: - movq %rcx,%rdi - movq %rdx,%rsi + movq %rcx,%rdi + movq %rdx,%rsi +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_384$1 +#endif pushq %rbp pushq %rbx @@ -1286,12 +1311,16 @@ sqr_mont_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqr_mont_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_mont_384$1 +#endif pushq %rbp pushq %rbx @@ -1321,7 +1350,7 @@ sqr_mont_384: movq 104(%rsp),%rbx movq 112(%rsp),%rdi call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 leaq 120(%rsp),%r8 movq 120(%rsp),%r15 @@ -1358,12 +1387,16 @@ redc_mont_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_redc_mont_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz redc_mont_384$1 +#endif pushq %rbp pushq %rbx @@ -1383,7 +1416,7 @@ redc_mont_384: movq %rdx,%rbx call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 movq 8(%rsp),%r15 @@ -1420,12 +1453,16 @@ 
from_mont_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_from_mont_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz from_mont_384$1 +#endif pushq %rbp pushq %rbx @@ -1795,9 +1832,9 @@ __mulq_by_1_mont_384: .byte 0xf3,0xc3 -.def __redc_tail_mont_384; .scl 3; .type 32; .endef +.def __redq_tail_mont_384; .scl 3; .type 32; .endef .p2align 5 -__redc_tail_mont_384: +__redq_tail_mont_384: .byte 0xf3,0x0f,0x1e,0xfa addq 48(%rsi),%r14 @@ -1852,11 +1889,15 @@ sgn0_pty_mont_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sgn0_pty_mont_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sgn0_pty_mont_384$1 +#endif pushq %rbp pushq %rbx @@ -1934,11 +1975,15 @@ sgn0_pty_mont_384x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sgn0_pty_mont_384x: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sgn0_pty_mont_384x$1 +#endif pushq %rbp pushq %rbx @@ -2065,13 +2110,17 @@ mul_mont_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_mont_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_mont_384$1 +#endif pushq %rbp pushq %rbx @@ -2733,14 +2782,18 @@ sqr_n_mul_mont_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqr_n_mul_mont_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 movq 48(%rsp),%r9 - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_n_mul_mont_384$1 +#endif pushq %rbp pushq %rbx @@ -2774,7 +2827,7 @@ sqr_n_mul_mont_384: movq 0(%rsp),%rcx movq 16(%rsp),%rbx call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 movd %xmm1,%edx leaq 0(%rdi),%rsi @@ -2828,14 +2881,18 @@ sqr_n_mul_mont_383: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqr_n_mul_mont_383: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 movq 48(%rsp),%r9 - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_n_mul_mont_383$1 +#endif pushq %rbp pushq %rbx @@ -3494,12 +3551,16 @@ sqr_mont_382x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqr_mont_382x: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_mont_382x$1 +#endif pushq %rbp pushq %rbx @@ -3858,8 +3919,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mul_mont_384x_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x29,0x00 @@ -3871,6 +3933,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x30,0x00 .byte 0x00,0x64,0x31,0x00 .byte 0x00,0x01,0x2f,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_mont_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3881,8 +3945,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqr_mont_384x_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x11,0x00 @@ -3894,6 +3959,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x18,0x00 .byte 0x00,0x64,0x19,0x00 .byte 0x00,0x01,0x17,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_mont_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3904,8 +3971,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 
+.long 0,0 .LSEH_info_mul_382x_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x11,0x00 @@ -3917,6 +3985,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x18,0x00 .byte 0x00,0x64,0x19,0x00 .byte 0x00,0x01,0x17,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_382x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3927,8 +3997,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqr_382x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -3940,7 +4011,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_382x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3951,8 +4023,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mul_384_body: .byte 1,0,11,0 .byte 0x00,0xc4,0x00,0x00 @@ -3972,8 +4045,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqr_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -3985,7 +4059,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3996,8 +4071,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqr_mont_384_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x0f,0x00 @@ -4009,6 +4085,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x16,0x00 .byte 0x00,0x64,0x17,0x00 .byte 0x00,0x01,0x15,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -4019,8 +4097,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_redc_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -4032,7 +4111,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_redc_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -4043,8 +4123,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_from_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -4056,7 +4137,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_from_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -4067,8 +4149,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sgn0_pty_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -4080,7 +4163,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sgn0_pty_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -4091,8 +4175,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sgn0_pty_mont_384x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -4104,7 +4189,8 @@ 
sqr_mont_382x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sgn0_pty_mont_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -4115,8 +4201,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mul_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x03,0x00 @@ -4128,7 +4215,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x0a,0x00 .byte 0x00,0x64,0x0b,0x00 .byte 0x00,0x82 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -4139,8 +4227,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqr_n_mul_mont_384_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x11,0x00 @@ -4152,6 +4241,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x18,0x00 .byte 0x00,0x64,0x19,0x00 .byte 0x00,0x01,0x17,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_n_mul_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -4162,8 +4253,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqr_n_mul_mont_383_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x11,0x00 @@ -4175,6 +4267,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x18,0x00 .byte 0x00,0x64,0x19,0x00 .byte 0x00,0x01,0x17,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_n_mul_mont_383_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -4185,8 +4279,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqr_mont_382x_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x11,0x00 @@ -4198,6 +4293,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x18,0x00 .byte 0x00,0x64,0x19,0x00 .byte 0x00,0x01,0x17,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_mont_382x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 diff --git a/crypto/blst_src/build/coff/mulx_mont_256-x86_64.s b/crypto/blst_src/build/coff/mulx_mont_256-x86_64.s index 75c7e82bc1a..cba65569c52 100644 --- a/crypto/blst_src/build/coff/mulx_mont_256-x86_64.s +++ b/crypto/blst_src/build/coff/mulx_mont_256-x86_64.s @@ -10,13 +10,14 @@ mulx_mont_sparse_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mulx_mont_sparse_256: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 - - +mul_mont_sparse_256$1: pushq %rbp pushq %rbx @@ -78,12 +79,13 @@ sqrx_mont_sparse_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqrx_mont_sparse_256: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +sqr_mont_sparse_256$1: pushq %rbp pushq %rbx @@ -342,12 +344,13 @@ fromx_mont_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_fromx_mont_256: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +from_mont_256$1: pushq %rbp pushq %rbx @@ -422,12 +425,13 @@ redcx_mont_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_redcx_mont_256: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +redc_mont_256$1: pushq %rbp pushq %rbx @@ -690,8 +694,9 @@ __mulx_by_1_mont_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mulx_mont_sparse_256_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -703,7 +708,8 @@ __mulx_by_1_mont_256: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 
0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mulx_mont_sparse_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -714,8 +720,9 @@ __mulx_by_1_mont_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqrx_mont_sparse_256_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -727,7 +734,8 @@ __mulx_by_1_mont_256: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_mont_sparse_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -738,8 +746,9 @@ __mulx_by_1_mont_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_fromx_mont_256_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -751,7 +760,8 @@ __mulx_by_1_mont_256: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_fromx_mont_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -762,8 +772,9 @@ __mulx_by_1_mont_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_redcx_mont_256_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -775,7 +786,8 @@ __mulx_by_1_mont_256: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_redcx_mont_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 diff --git a/crypto/blst_src/build/coff/mulx_mont_384-x86_64.s b/crypto/blst_src/build/coff/mulx_mont_384-x86_64.s index 12306a7ff5c..ce1354f46b4 100644 --- a/crypto/blst_src/build/coff/mulx_mont_384-x86_64.s +++ b/crypto/blst_src/build/coff/mulx_mont_384-x86_64.s @@ -6,9 +6,9 @@ -.def __sub_mod_384x384; .scl 3; .type 32; .endef +.def __subx_mod_384x384; .scl 3; .type 32; .endef .p2align 5 -__sub_mod_384x384: +__subx_mod_384x384: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 @@ -73,9 +73,9 @@ __sub_mod_384x384: .byte 0xf3,0xc3 -.def __add_mod_384; .scl 3; .type 32; .endef +.def __addx_mod_384; .scl 3; .type 32; .endef .p2align 5 -__add_mod_384: +__addx_mod_384: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 @@ -123,9 +123,9 @@ __add_mod_384: .byte 0xf3,0xc3 -.def __sub_mod_384; .scl 3; .type 32; .endef +.def __subx_mod_384; .scl 3; .type 32; .endef .p2align 5 -__sub_mod_384: +__subx_mod_384: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 @@ -135,7 +135,7 @@ __sub_mod_384: movq 32(%rsi),%r12 movq 40(%rsi),%r13 -__sub_mod_384_a_is_loaded: +__subx_mod_384_a_is_loaded: subq 0(%rdx),%r8 movq 0(%rcx),%r14 sbbq 8(%rdx),%r9 @@ -182,13 +182,14 @@ mulx_mont_384x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mulx_mont_384x: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 - - +mul_mont_384x$1: pushq %rbp pushq %rbx @@ -230,12 +231,12 @@ mulx_mont_384x: leaq (%rbx),%rsi leaq -48(%rbx),%rdx leaq 40+192+48(%rsp),%rdi - call __add_mod_384 + call __addx_mod_384 movq 24(%rsp),%rsi leaq 48(%rsi),%rdx leaq -48(%rdi),%rdi - call __add_mod_384 + call __addx_mod_384 leaq (%rdi),%rbx leaq 48(%rdi),%rsi @@ -245,17 +246,17 @@ mulx_mont_384x: leaq (%rdi),%rsi leaq 40(%rsp),%rdx movq 8(%rsp),%rcx - call __sub_mod_384x384 + call __subx_mod_384x384 leaq (%rdi),%rsi leaq -96(%rdi),%rdx - call __sub_mod_384x384 + call __subx_mod_384x384 leaq 40(%rsp),%rsi leaq 40+96(%rsp),%rdx leaq 40(%rsp),%rdi - 
call __sub_mod_384x384 + call __subx_mod_384x384 leaq (%rcx),%rbx @@ -264,14 +265,14 @@ mulx_mont_384x: movq 0(%rsp),%rcx movq 32(%rsp),%rdi call __mulx_by_1_mont_384 - call __redc_tail_mont_384 + call __redx_tail_mont_384 leaq 40+192(%rsp),%rsi movq 0(%rsp),%rcx leaq 48(%rdi),%rdi call __mulx_by_1_mont_384 - call __redc_tail_mont_384 + call __redx_tail_mont_384 leaq 328(%rsp),%r8 movq 0(%r8),%r15 @@ -305,12 +306,13 @@ sqrx_mont_384x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqrx_mont_384x: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +sqr_mont_384x$1: pushq %rbp pushq %rbx @@ -337,13 +339,13 @@ sqrx_mont_384x: leaq 48(%rsi),%rdx leaq 32(%rsp),%rdi - call __add_mod_384 + call __addx_mod_384 movq 24(%rsp),%rsi leaq 48(%rsi),%rdx leaq 32+48(%rsp),%rdi - call __sub_mod_384 + call __subx_mod_384 movq 24(%rsp),%rsi @@ -445,12 +447,13 @@ mulx_382x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mulx_382x: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +mul_382x$1: pushq %rbp pushq %rbx @@ -540,18 +543,18 @@ mulx_382x: leaq 32(%rsp),%rdx movq 24(%rsp),%rcx movq %rsi,%rdi - call __sub_mod_384x384 + call __subx_mod_384x384 leaq 0(%rdi),%rsi leaq -96(%rdi),%rdx - call __sub_mod_384x384 + call __subx_mod_384x384 leaq -96(%rdi),%rsi leaq 32(%rsp),%rdx leaq -96(%rdi),%rdi - call __sub_mod_384x384 + call __subx_mod_384x384 leaq 136(%rsp),%r8 movq 0(%r8),%r15 @@ -585,11 +588,12 @@ sqrx_382x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqrx_382x: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx - - +sqr_382x$1: pushq %rbp pushq %rbx @@ -640,7 +644,7 @@ sqrx_382x: leaq 48(%rsi),%rdx leaq 48(%rdi),%rdi - call __sub_mod_384_a_is_loaded + call __subx_mod_384_a_is_loaded leaq (%rdi),%rsi @@ -722,11 +726,12 @@ mulx_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mulx_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx - - +mul_384$1: pushq %rbp pushq %rbx @@ -950,10 +955,11 @@ sqrx_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqrx_384: - movq %rcx,%rdi - movq %rdx,%rsi + movq %rcx,%rdi + movq %rdx,%rsi +sqr_384$1: pushq %rbp pushq %rbx @@ -1145,12 +1151,13 @@ redcx_mont_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_redcx_mont_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +redc_mont_384$1: pushq %rbp pushq %rbx @@ -1170,7 +1177,7 @@ redcx_mont_384: movq %rdx,%rbx call __mulx_by_1_mont_384 - call __redc_tail_mont_384 + call __redx_tail_mont_384 movq 8(%rsp),%r15 @@ -1207,12 +1214,13 @@ fromx_mont_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_fromx_mont_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +from_mont_384$1: pushq %rbp pushq %rbx @@ -1473,9 +1481,9 @@ __mulx_by_1_mont_384: .byte 0xf3,0xc3 -.def __redc_tail_mont_384; .scl 3; .type 32; .endef +.def __redx_tail_mont_384; .scl 3; .type 32; .endef .p2align 5 -__redc_tail_mont_384: +__redx_tail_mont_384: .byte 0xf3,0x0f,0x1e,0xfa addq 48(%rsi),%r14 @@ -1530,11 +1538,12 @@ sgn0x_pty_mont_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sgn0x_pty_mont_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx - - +sgn0_pty_mont_384$1: pushq %rbp pushq %rbx @@ -1612,11 +1621,12 @@ sgn0x_pty_mont_384x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sgn0x_pty_mont_384x: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx - - +sgn0_pty_mont_384x$1: pushq %rbp pushq %rbx @@ -1743,13 +1753,14 @@ mulx_mont_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mulx_mont_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 - - 
+mul_mont_384$1: pushq %rbp pushq %rbx @@ -2215,12 +2226,13 @@ sqrx_mont_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqrx_mont_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +sqr_mont_384$1: pushq %rbp pushq %rbx @@ -2287,14 +2299,15 @@ sqrx_n_mul_mont_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqrx_n_mul_mont_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 movq 48(%rsp),%r9 - - +sqr_n_mul_mont_384$1: pushq %rbp pushq %rbx @@ -2379,14 +2392,15 @@ sqrx_n_mul_mont_383: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqrx_n_mul_mont_383: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 movq 48(%rsp),%r9 - - +sqr_n_mul_mont_383$1: pushq %rbp pushq %rbx @@ -2831,12 +2845,13 @@ sqrx_mont_382x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqrx_mont_382x: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +sqr_mont_382x$1: pushq %rbp pushq %rbx @@ -3205,8 +3220,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mulx_mont_384x_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x29,0x00 @@ -3218,6 +3234,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x30,0x00 .byte 0x00,0x64,0x31,0x00 .byte 0x00,0x01,0x2f,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mulx_mont_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3228,8 +3246,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqrx_mont_384x_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x11,0x00 @@ -3241,6 +3260,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x18,0x00 .byte 0x00,0x64,0x19,0x00 .byte 0x00,0x01,0x17,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_mont_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3251,8 +3272,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mulx_382x_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x11,0x00 @@ -3264,6 +3286,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x18,0x00 .byte 0x00,0x64,0x19,0x00 .byte 0x00,0x01,0x17,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mulx_382x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3274,8 +3298,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqrx_382x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -3287,7 +3312,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_382x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3298,8 +3324,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mulx_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x00,0x00 @@ -3311,7 +3338,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x07,0x00 .byte 0x00,0x64,0x08,0x00 .byte 0x00,0x52 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mulx_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3322,8 +3350,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqrx_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -3335,7 +3364,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 
.byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3346,8 +3376,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_redcx_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -3359,7 +3390,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_redcx_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3370,8 +3402,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_fromx_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -3383,7 +3416,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_fromx_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3394,8 +3428,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sgn0x_pty_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -3407,7 +3442,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sgn0x_pty_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3418,8 +3454,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sgn0x_pty_mont_384x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -3431,7 +3468,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sgn0x_pty_mont_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3442,8 +3480,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mulx_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x03,0x00 @@ -3455,7 +3494,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x0a,0x00 .byte 0x00,0x64,0x0b,0x00 .byte 0x00,0x82 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mulx_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3466,8 +3506,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqrx_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x03,0x00 @@ -3479,7 +3520,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x0a,0x00 .byte 0x00,0x64,0x0b,0x00 .byte 0x00,0x82 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3490,8 +3532,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqrx_n_mul_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x05,0x00 @@ -3503,7 +3546,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x0c,0x00 .byte 0x00,0x64,0x0d,0x00 .byte 0x00,0xa2 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_n_mul_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3514,8 +3558,9 @@ sqrx_mont_382x: .byte 
1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqrx_n_mul_mont_383_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x05,0x00 @@ -3527,7 +3572,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x0c,0x00 .byte 0x00,0x64,0x0d,0x00 .byte 0x00,0xa2 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_n_mul_mont_383_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3538,8 +3584,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqrx_mont_382x_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x11,0x00 @@ -3551,6 +3598,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x18,0x00 .byte 0x00,0x64,0x19,0x00 .byte 0x00,0x01,0x17,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_mont_382x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 diff --git a/crypto/blst_src/build/coff/sha256-armv8.S b/crypto/blst_src/build/coff/sha256-armv8.S index a8bcbd3631b..a4cd8090896 100644 --- a/crypto/blst_src/build/coff/sha256-armv8.S +++ b/crypto/blst_src/build/coff/sha256-armv8.S @@ -10,11 +10,12 @@ // // sha256_block procedure for ARMv8. // -// This module is stripped of scalar code paths, with raionale that all +// This module is stripped of scalar code paths, with rationale that all // known processors are NEON-capable. // // See original module at CRYPTOGAMS for further details. +.comm __blst_platform_cap,4 .text .p2align 6 @@ -188,6 +189,11 @@ blst_sha256_block_armv8: .endef .p2align 4 blst_sha256_block_data_order: + adrp x16,__blst_platform_cap + ldr w16,[x16,#:lo12:__blst_platform_cap] + tst w16,#1 + b.ne .Lv8_entry + stp x29, x30, [sp, #-16]! mov x29, sp sub sp,sp,#16*4 diff --git a/crypto/blst_src/build/coff/sha256-portable-x86_64.s b/crypto/blst_src/build/coff/sha256-portable-x86_64.s index e499d107c70..603e46c53d7 100644 --- a/crypto/blst_src/build/coff/sha256-portable-x86_64.s +++ b/crypto/blst_src/build/coff/sha256-portable-x86_64.s @@ -1,3 +1,4 @@ +.comm __blst_platform_cap,4 .text .globl blst_sha256_block_data_order @@ -9,15 +10,21 @@ blst_sha256_block_data_order: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_blst_sha256_block_data_order: + + + pushq %rbp + + movq %rsp,%rbp + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx - - +#ifdef __BLST_PORTABLE__ + testl $2,__blst_platform_cap(%rip) + jnz .Lblst_sha256_block_data_order$2 +#endif pushq %rbx - pushq %rbp - pushq %r12 pushq %r13 @@ -29,12 +36,13 @@ blst_sha256_block_data_order: shlq $4,%rdx subq $64+24,%rsp + +.LSEH_body_blst_sha256_block_data_order: + leaq (%rsi,%rdx,4),%rdx movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) -.LSEH_body_blst_sha256_block_data_order: - movl 0(%rdi),%eax movl 4(%rdi),%ebx @@ -1637,17 +1645,11 @@ blst_sha256_block_data_order: leaq 64+24+48(%rsp),%r11 movq 64+24(%rsp),%r15 - movq -40(%r11),%r14 - movq -32(%r11),%r13 - movq -24(%r11),%r12 - - movq -16(%r11),%rbp - - movq -8(%r11),%rbx - + movq -16(%r11),%rbx + movq -8(%r11),%rbp .LSEH_epilogue_blst_sha256_block_data_order: mov 8(%r11),%rdi mov 16(%r11),%rsi @@ -1657,6 +1659,7 @@ blst_sha256_block_data_order: .LSEH_end_blst_sha256_block_data_order: +#ifndef __BLST_PORTABLE__ .p2align 6 K256: @@ -1742,6 +1745,7 @@ blst_sha256_hcopy: movq %r11,24(%rcx) .byte 0xf3,0xc3 +#endif .section .pdata .p2align 2 .rva .LSEH_begin_blst_sha256_block_data_order @@ -1759,26 +1763,30 @@ blst_sha256_hcopy: .section .xdata .p2align 3 .LSEH_info_blst_sha256_block_data_order_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 
-.byte 0,0x64,2,0 -.byte 0,0x03 -.byte 0,0 +.byte 1,4,6,0x05 +.byte 4,0x74,2,0 +.byte 4,0x64,3,0 +.byte 4,0x53 +.byte 1,0x50 +.long 0,0 .LSEH_info_blst_sha256_block_data_order_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x0b,0x00 .byte 0x00,0xe4,0x0c,0x00 .byte 0x00,0xd4,0x0d,0x00 .byte 0x00,0xc4,0x0e,0x00 -.byte 0x00,0x54,0x0f,0x00 -.byte 0x00,0x34,0x10,0x00 +.byte 0x00,0x34,0x0f,0x00 +.byte 0x00,0x54,0x10,0x00 .byte 0x00,0x74,0x12,0x00 .byte 0x00,0x64,0x13,0x00 .byte 0x00,0x01,0x11,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_blst_sha256_block_data_order_epilogue: .byte 1,0,5,11 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x03 -.byte 0x00,0x00 +.byte 0x00,0xb3 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 diff --git a/crypto/blst_src/build/coff/sha256-x86_64.s b/crypto/blst_src/build/coff/sha256-x86_64.s index ed28b781d4c..d65df5d0d4d 100644 --- a/crypto/blst_src/build/coff/sha256-x86_64.s +++ b/crypto/blst_src/build/coff/sha256-x86_64.s @@ -1,3 +1,4 @@ +.comm __blst_platform_cap,4 .text .p2align 6 @@ -34,22 +35,23 @@ blst_sha256_block_data_order_shaext: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_blst_sha256_block_data_order_shaext: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - - - subq $0x58,%rsp - movaps %xmm6,-88(%r11) - movaps %xmm7,-72(%r11) + pushq %rbp - movaps %xmm8,-56(%r11) + movq %rsp,%rbp - movaps %xmm9,-40(%r11) + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx +.Lblst_sha256_block_data_order$2: + subq $0x50,%rsp - movaps %xmm10,-24(%r11) + movaps %xmm6,-80(%rbp) + movaps %xmm7,-64(%rbp) + movaps %xmm8,-48(%rbp) + movaps %xmm9,-32(%rbp) + movaps %xmm10,-16(%rbp) .LSEH_body_blst_sha256_block_data_order_shaext: @@ -254,16 +256,18 @@ blst_sha256_block_data_order_shaext: movdqu %xmm1,(%rdi) movdqu %xmm2,16(%rdi) - movaps -88(%r11),%xmm6 - movaps -72(%r11),%xmm7 - movaps -56(%r11),%xmm8 - movaps -40(%r11),%xmm9 - movaps -24(%r11),%xmm10 - movq %r11,%rsp + movaps -80(%rbp),%xmm6 + movaps -64(%rbp),%xmm7 + movaps -48(%rbp),%xmm8 + movaps -32(%rbp),%xmm9 + movaps -16(%rbp),%xmm10 + movq %rbp,%rsp + + popq %rbp .LSEH_epilogue_blst_sha256_block_data_order_shaext: - mov 8(%r11),%rdi - mov 16(%r11),%rsi + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi .byte 0xf3,0xc3 @@ -278,13 +282,17 @@ blst_sha256_block_data_order: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_blst_sha256_block_data_order: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx pushq %rbp + movq %rsp,%rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + testl $2,__blst_platform_cap(%rip) + jnz .Lblst_sha256_block_data_order$2 pushq %rbx pushq %r12 @@ -296,21 +304,16 @@ blst_sha256_block_data_order: pushq %r15 shlq $4,%rdx - subq $104,%rsp + subq $88,%rsp leaq (%rsi,%rdx,4),%rdx - movq %rdi,0(%rsp) - - movq %rdx,16(%rsp) - movaps %xmm6,32(%rsp) - - movaps %xmm7,48(%rsp) + movq %rdi,-64(%rbp) - movaps %xmm8,64(%rsp) - - movaps %xmm9,80(%rsp) - - movq %rsp,%rbp + movq %rdx,-48(%rbp) + movaps %xmm6,-128(%rbp) + movaps %xmm7,-112(%rbp) + movaps %xmm8,-96(%rbp) + movaps %xmm9,-80(%rbp) .LSEH_body_blst_sha256_block_data_order: @@ -331,7 +334,7 @@ blst_sha256_block_data_order: .p2align 4 .Lloop_ssse3: movdqa K256+256(%rip),%xmm7 - movq %rsi,8(%rbp) + movq %rsi,-56(%rbp) movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 @@ -1356,9 +1359,9 @@ blst_sha256_block_data_order: addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d - movq 0(%rbp),%rdi + movq -64(%rbp),%rdi movl %r14d,%eax - movq 8(%rbp),%rsi + movq -56(%rbp),%rsi addl 0(%rdi),%eax addl 4(%rdi),%ebx @@ 
-1370,7 +1373,7 @@ blst_sha256_block_data_order: addl 28(%rdi),%r11d leaq 64(%rsi),%rsi - cmpq 16(%rbp),%rsi + cmpq -48(%rbp),%rsi movl %eax,0(%rdi) movl %ebx,4(%rdi) @@ -1383,33 +1386,27 @@ blst_sha256_block_data_order: jb .Lloop_ssse3 xorps %xmm0,%xmm0 - leaq 104+48(%rbp),%r11 - movaps %xmm0,0(%rsp) movaps %xmm0,16(%rsp) movaps %xmm0,32(%rsp) movaps %xmm0,48(%rsp) - movaps 32(%rbp),%xmm6 - movaps 48(%rbp),%xmm7 - movaps 64(%rbp),%xmm8 - movaps 80(%rbp),%xmm9 - movq 104(%rbp),%r15 - - movq -40(%r11),%r14 - - movq -32(%r11),%r13 - - movq -24(%r11),%r12 - - movq -16(%r11),%rbx - - movq -8(%r11),%rbp + movaps -128(%rbp),%xmm6 + movaps -112(%rbp),%xmm7 + movaps -96(%rbp),%xmm8 + movaps -80(%rbp),%xmm9 + movq -40(%rbp),%r15 + movq -32(%rbp),%r14 + movq -24(%rbp),%r13 + movq -16(%rbp),%r12 + movq -8(%rbp),%rbx + movq %rbp,%rsp + + popq %rbp .LSEH_epilogue_blst_sha256_block_data_order: - mov 8(%r11),%rdi - mov 16(%r11),%rsi + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi - leaq (%r11),%rsp .byte 0xf3,0xc3 .LSEH_end_blst_sha256_block_data_order: @@ -1506,13 +1503,14 @@ blst_sha256_hcopy: .section .xdata .p2align 3 .LSEH_info_blst_sha256_block_data_order_shaext_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0x03 -.byte 0,0 +.byte 1,4,6,0x05 +.byte 4,0x74,2,0 +.byte 4,0x64,3,0 +.byte 4,0x53 +.byte 1,0x50 +.long 0,0 .LSEH_info_blst_sha256_block_data_order_shaext_body: -.byte 1,0,15,0 +.byte 1,0,17,85 .byte 0x00,0x68,0x00,0x00 .byte 0x00,0x78,0x01,0x00 .byte 0x00,0x88,0x02,0x00 @@ -1520,41 +1518,45 @@ blst_sha256_hcopy: .byte 0x00,0xa8,0x04,0x00 .byte 0x00,0x74,0x0c,0x00 .byte 0x00,0x64,0x0d,0x00 -.byte 0x00,0xa2 +.byte 0x00,0x53 +.byte 0x00,0x92 +.byte 0x00,0x50 .byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_blst_sha256_block_data_order_shaext_epilogue: -.byte 1,0,5,11 +.byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x03 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_blst_sha256_block_data_order_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0x03 -.byte 0,0 +.byte 1,4,6,0x05 +.byte 4,0x74,2,0 +.byte 4,0x64,3,0 +.byte 4,0x53 +.byte 1,0x50 +.long 0,0 .LSEH_info_blst_sha256_block_data_order_body: -.byte 1,0,26,5 -.byte 0x00,0x68,0x02,0x00 -.byte 0x00,0x78,0x03,0x00 -.byte 0x00,0x88,0x04,0x00 -.byte 0x00,0x98,0x05,0x00 -.byte 0x00,0xf4,0x0d,0x00 -.byte 0x00,0xe4,0x0e,0x00 -.byte 0x00,0xd4,0x0f,0x00 -.byte 0x00,0xc4,0x10,0x00 -.byte 0x00,0x34,0x11,0x00 -.byte 0x00,0x74,0x14,0x00 -.byte 0x00,0x64,0x15,0x00 -.byte 0x00,0x03 -.byte 0x00,0x01,0x12,0x00 +.byte 1,0,25,133 +.byte 0x00,0x68,0x00,0x00 +.byte 0x00,0x78,0x01,0x00 +.byte 0x00,0x88,0x02,0x00 +.byte 0x00,0x98,0x03,0x00 +.byte 0x00,0xf4,0x0b,0x00 +.byte 0x00,0xe4,0x0c,0x00 +.byte 0x00,0xd4,0x0d,0x00 +.byte 0x00,0xc4,0x0e,0x00 +.byte 0x00,0x34,0x0f,0x00 +.byte 0x00,0x74,0x12,0x00 +.byte 0x00,0x64,0x13,0x00 +.byte 0x00,0x53 +.byte 0x00,0xf2 .byte 0x00,0x50 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_blst_sha256_block_data_order_epilogue: -.byte 1,0,5,11 +.byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x03 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00 diff --git a/crypto/blst_src/build/elf/ct_inverse_mod_256-armv8.S b/crypto/blst_src/build/elf/ct_inverse_mod_256-armv8.S index 347eb315f40..0c5ac5b882d 100644 --- a/crypto/blst_src/build/elf/ct_inverse_mod_256-armv8.S +++ b/crypto/blst_src/build/elf/ct_inverse_mod_256-armv8.S @@ -1,6 +1,7 @@ .text .globl ct_inverse_mod_256 +.hidden 
ct_inverse_mod_256 .type ct_inverse_mod_256, %function .align 5 ct_inverse_mod_256: @@ -60,14 +61,14 @@ ct_inverse_mod_256: madd x4, x16, x8, xzr // |u|*|f0| madd x4, x17, x9, x4 // |v|*|g0| str x4, [x0,#8*4] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*5] stp x5, x5, [x0,#8*7] madd x4, x12, x8, xzr // |u|*|f1| madd x4, x13, x9, x4 // |v|*|g1| str x4, [x0,#8*9] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*10] stp x5, x5, [x0,#8*12] eor x1, x1, #256 // flip-flop src |a|b|u|v| diff --git a/crypto/blst_src/build/elf/ct_inverse_mod_256-x86_64.s b/crypto/blst_src/build/elf/ct_inverse_mod_256-x86_64.s index c4d8d6d3700..0f0ca4923d7 100644 --- a/crypto/blst_src/build/elf/ct_inverse_mod_256-x86_64.s +++ b/crypto/blst_src/build/elf/ct_inverse_mod_256-x86_64.s @@ -1,6 +1,7 @@ .text .globl ct_inverse_mod_256 +.hidden ct_inverse_mod_256 .type ct_inverse_mod_256,@function .align 32 ct_inverse_mod_256: diff --git a/crypto/blst_src/build/elf/ct_inverse_mod_384-armv8.S b/crypto/blst_src/build/elf/ct_inverse_mod_384-armv8.S index d7eca17073c..99bb9def767 100644 --- a/crypto/blst_src/build/elf/ct_inverse_mod_384-armv8.S +++ b/crypto/blst_src/build/elf/ct_inverse_mod_384-armv8.S @@ -1,6 +1,7 @@ .text .globl ct_inverse_mod_383 +.hidden ct_inverse_mod_383 .type ct_inverse_mod_383, %function .align 5 ct_inverse_mod_383: @@ -71,7 +72,7 @@ ct_inverse_mod_383: adds x3, x3, x5 adc x4, x4, x6 stp x3, x4, [x0,#8*6] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*8] stp x5, x5, [x0,#8*10] @@ -82,7 +83,7 @@ ct_inverse_mod_383: adds x3, x3, x5 adc x4, x4, x6 stp x3, x4, [x0,#8*12] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*14] stp x5, x5, [x0,#8*16] eor x1, x1, #256 // flip-flop src |a|b|u|v| diff --git a/crypto/blst_src/build/elf/ct_is_square_mod_384-armv8.S b/crypto/blst_src/build/elf/ct_is_square_mod_384-armv8.S index 3f1390ed9dc..07dd99a8af3 100644 --- a/crypto/blst_src/build/elf/ct_is_square_mod_384-armv8.S +++ b/crypto/blst_src/build/elf/ct_is_square_mod_384-armv8.S @@ -1,6 +1,7 @@ .text .globl ct_is_square_mod_384 +.hidden ct_is_square_mod_384 .type ct_is_square_mod_384, %function .align 5 ct_is_square_mod_384: diff --git a/crypto/blst_src/build/elf/ct_is_square_mod_384-x86_64.s b/crypto/blst_src/build/elf/ct_is_square_mod_384-x86_64.s index fec1493cb12..bf610fa7440 100644 --- a/crypto/blst_src/build/elf/ct_is_square_mod_384-x86_64.s +++ b/crypto/blst_src/build/elf/ct_is_square_mod_384-x86_64.s @@ -1,6 +1,7 @@ .text .globl ct_is_square_mod_384 +.hidden ct_is_square_mod_384 .type ct_is_square_mod_384,@function .align 32 ct_is_square_mod_384: diff --git a/crypto/blst_src/build/elf/ctq_inverse_mod_384-x86_64.s b/crypto/blst_src/build/elf/ctq_inverse_mod_384-x86_64.s index b702262f6e5..9cca518721f 100644 --- a/crypto/blst_src/build/elf/ctq_inverse_mod_384-x86_64.s +++ b/crypto/blst_src/build/elf/ctq_inverse_mod_384-x86_64.s @@ -1,6 +1,8 @@ +.comm __blst_platform_cap,4 .text .globl ct_inverse_mod_383 +.hidden ct_inverse_mod_383 .type ct_inverse_mod_383,@function .align 32 ct_inverse_mod_383: @@ -8,6 +10,10 @@ ct_inverse_mod_383: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz ct_inverse_mod_383$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 diff --git a/crypto/blst_src/build/elf/ctx_inverse_mod_384-x86_64.s b/crypto/blst_src/build/elf/ctx_inverse_mod_384-x86_64.s index 
25a5fa5345f..9f4d12babd4 100644 --- a/crypto/blst_src/build/elf/ctx_inverse_mod_384-x86_64.s +++ b/crypto/blst_src/build/elf/ctx_inverse_mod_384-x86_64.s @@ -1,6 +1,7 @@ .text .globl ctx_inverse_mod_383 +.hidden ctx_inverse_mod_383 .type ctx_inverse_mod_383,@function .align 32 ctx_inverse_mod_383: @@ -8,6 +9,7 @@ ctx_inverse_mod_383: .byte 0xf3,0x0f,0x1e,0xfa +ct_inverse_mod_383$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -810,7 +812,7 @@ ctx_inverse_mod_383: movq 48(%rsi),%r10 - call __inner_loop_62 + call __tail_loop_53 @@ -1521,9 +1523,9 @@ __inner_loop_31: .cfi_endproc .size __inner_loop_31,.-__inner_loop_31 -.type __inner_loop_62,@function +.type __tail_loop_53,@function .align 32 -__inner_loop_62: +__tail_loop_53: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -1532,7 +1534,7 @@ __inner_loop_62: xorq %r12,%r12 movq $1,%r13 -.Loop_62: +.Loop_53: xorq %rax,%rax testq $1,%r8 movq %r10,%rbx @@ -1559,11 +1561,11 @@ __inner_loop_62: subq %rax,%rdx subq %rbx,%rcx subl $1,%edi - jnz .Loop_62 + jnz .Loop_53 .byte 0xf3,0xc3 .cfi_endproc -.size __inner_loop_62,.-__inner_loop_62 +.size __tail_loop_53,.-__tail_loop_53 .section .note.GNU-stack,"",@progbits .section .note.gnu.property,"a",@note diff --git a/crypto/blst_src/build/elf/div3w-armv8.S b/crypto/blst_src/build/elf/div3w-armv8.S index a2b1d676a36..37621bee415 100644 --- a/crypto/blst_src/build/elf/div3w-armv8.S +++ b/crypto/blst_src/build/elf/div3w-armv8.S @@ -25,7 +25,7 @@ div_3_limbs: asr x3,x0,#63 // top bit -> mask add x0,x0,x0 // Q <<= 1 subs x6,x4,x1 // R - D - add x0,x0,#1 // Q + specilative bit + add x0,x0,#1 // Q + speculative bit sbcs x7,x5,x2 sbc x0,x0,xzr // subtract speculative bit diff --git a/crypto/blst_src/build/elf/div3w-x86_64.s b/crypto/blst_src/build/elf/div3w-x86_64.s index 00ae5699824..5d9fd8a9139 100644 --- a/crypto/blst_src/build/elf/div3w-x86_64.s +++ b/crypto/blst_src/build/elf/div3w-x86_64.s @@ -8,6 +8,8 @@ div_3_limbs: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa + + movq (%rdi),%r8 movq 8(%rdi),%r9 xorq %rax,%rax @@ -39,8 +41,9 @@ div_3_limbs: orq %rcx,%rax + .byte 0xf3,0xc3 -.cfi_endproc +.cfi_endproc .size div_3_limbs,.-div_3_limbs .globl quot_rem_128 .hidden quot_rem_128 @@ -50,6 +53,8 @@ quot_rem_128: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rax movq %rdx,%rcx @@ -84,8 +89,9 @@ quot_rem_128: movq %rcx,%rax + .byte 0xf3,0xc3 -.cfi_endproc +.cfi_endproc .size quot_rem_128,.-quot_rem_128 @@ -100,6 +106,8 @@ quot_rem_64: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rax imulq 0(%rsi),%rdx @@ -110,8 +118,9 @@ quot_rem_64: movq %r10,0(%rdi) movq %rax,8(%rdi) + .byte 0xf3,0xc3 -.cfi_endproc +.cfi_endproc .size quot_rem_64,.-quot_rem_64 .section .note.GNU-stack,"",@progbits diff --git a/crypto/blst_src/build/elf/mulq_mont_256-x86_64.s b/crypto/blst_src/build/elf/mulq_mont_256-x86_64.s index 37abd4392d3..10b1b56cb50 100644 --- a/crypto/blst_src/build/elf/mulq_mont_256-x86_64.s +++ b/crypto/blst_src/build/elf/mulq_mont_256-x86_64.s @@ -1,3 +1,4 @@ +.comm __blst_platform_cap,4 .text .globl mul_mont_sparse_256 @@ -9,6 +10,10 @@ mul_mont_sparse_256: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_mont_sparse_256$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -73,6 +78,10 @@ sqr_mont_sparse_256: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_mont_sparse_256$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -419,6 +428,10 @@ from_mont_256: 
.byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz from_mont_256$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -494,6 +507,10 @@ redc_mont_256: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz redc_mont_256$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 diff --git a/crypto/blst_src/build/elf/mulq_mont_384-x86_64.s b/crypto/blst_src/build/elf/mulq_mont_384-x86_64.s index fa9dd3529ad..903ba23b12c 100644 --- a/crypto/blst_src/build/elf/mulq_mont_384-x86_64.s +++ b/crypto/blst_src/build/elf/mulq_mont_384-x86_64.s @@ -1,3 +1,4 @@ +.comm __blst_platform_cap,4 .text @@ -6,9 +7,9 @@ -.type __sub_mod_384x384,@function +.type __subq_mod_384x384,@function .align 32 -__sub_mod_384x384: +__subq_mod_384x384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -73,11 +74,11 @@ __sub_mod_384x384: .byte 0xf3,0xc3 .cfi_endproc -.size __sub_mod_384x384,.-__sub_mod_384x384 +.size __subq_mod_384x384,.-__subq_mod_384x384 -.type __add_mod_384,@function +.type __addq_mod_384,@function .align 32 -__add_mod_384: +__addq_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -125,11 +126,11 @@ __add_mod_384: .byte 0xf3,0xc3 .cfi_endproc -.size __add_mod_384,.-__add_mod_384 +.size __addq_mod_384,.-__addq_mod_384 -.type __sub_mod_384,@function +.type __subq_mod_384,@function .align 32 -__sub_mod_384: +__subq_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -140,7 +141,7 @@ __sub_mod_384: movq 32(%rsi),%r12 movq 40(%rsi),%r13 -__sub_mod_384_a_is_loaded: +__subq_mod_384_a_is_loaded: subq 0(%rdx),%r8 movq 0(%rcx),%r14 sbbq 8(%rdx),%r9 @@ -177,7 +178,7 @@ __sub_mod_384_a_is_loaded: .byte 0xf3,0xc3 .cfi_endproc -.size __sub_mod_384,.-__sub_mod_384 +.size __subq_mod_384,.-__subq_mod_384 .globl mul_mont_384x .hidden mul_mont_384x .type mul_mont_384x,@function @@ -187,6 +188,10 @@ mul_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_mont_384x$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -232,12 +237,12 @@ mul_mont_384x: movq 8(%rsp),%rcx leaq -48(%rsi),%rdx leaq 40+192+48(%rsp),%rdi - call __add_mod_384 + call __addq_mod_384 movq 16(%rsp),%rsi leaq 48(%rsi),%rdx leaq -48(%rdi),%rdi - call __add_mod_384 + call __addq_mod_384 leaq (%rdi),%rbx leaq 48(%rdi),%rsi @@ -247,17 +252,17 @@ mul_mont_384x: leaq (%rdi),%rsi leaq 40(%rsp),%rdx movq 8(%rsp),%rcx - call __sub_mod_384x384 + call __subq_mod_384x384 leaq (%rdi),%rsi leaq -96(%rdi),%rdx - call __sub_mod_384x384 + call __subq_mod_384x384 leaq 40(%rsp),%rsi leaq 40+96(%rsp),%rdx leaq 40(%rsp),%rdi - call __sub_mod_384x384 + call __subq_mod_384x384 movq %rcx,%rbx @@ -266,14 +271,14 @@ mul_mont_384x: movq 0(%rsp),%rcx movq 32(%rsp),%rdi call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 leaq 40+192(%rsp),%rsi movq 0(%rsp),%rcx leaq 48(%rdi),%rdi call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 leaq 328(%rsp),%r8 movq 0(%r8),%r15 @@ -303,6 +308,10 @@ sqr_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_mont_384x$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -333,13 +342,13 @@ sqr_mont_384x: leaq 48(%rsi),%rdx leaq 32(%rsp),%rdi - call __add_mod_384 + call __addq_mod_384 movq 16(%rsp),%rsi leaq 48(%rsi),%rdx leaq 32+48(%rsp),%rdi - call __sub_mod_384 + call __subq_mod_384 movq 16(%rsp),%rsi @@ -427,6 +436,10 @@ mul_382x: .byte 
0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_382x$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -521,18 +534,18 @@ mul_382x: leaq 32(%rsp),%rdx movq 24(%rsp),%rcx movq %rsi,%rdi - call __sub_mod_384x384 + call __subq_mod_384x384 leaq 0(%rdi),%rsi leaq -96(%rdi),%rdx - call __sub_mod_384x384 + call __subq_mod_384x384 leaq -96(%rdi),%rsi leaq 32(%rsp),%rdx leaq -96(%rdi),%rdi - call __sub_mod_384x384 + call __subq_mod_384x384 leaq 136(%rsp),%r8 movq 0(%r8),%r15 @@ -562,6 +575,10 @@ sqr_382x: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_382x$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -617,7 +634,7 @@ sqr_382x: leaq 48(%rsi),%rdx leaq 48(%rdi),%rdi - call __sub_mod_384_a_is_loaded + call __subq_mod_384_a_is_loaded leaq (%rdi),%rsi @@ -695,6 +712,10 @@ mul_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1019,6 +1040,10 @@ sqr_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1265,6 +1290,10 @@ sqr_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_mont_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1299,7 +1328,7 @@ sqr_mont_384: movq 104(%rsp),%rbx movq 112(%rsp),%rdi call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 leaq 120(%rsp),%r8 movq 120(%rsp),%r15 @@ -1332,6 +1361,10 @@ redc_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz redc_mont_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1356,7 +1389,7 @@ redc_mont_384: movq %rdx,%rbx call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 movq 8(%rsp),%r15 .cfi_restore %r15 @@ -1389,6 +1422,10 @@ from_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz from_mont_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1762,9 +1799,9 @@ __mulq_by_1_mont_384: .cfi_endproc .size __mulq_by_1_mont_384,.-__mulq_by_1_mont_384 -.type __redc_tail_mont_384,@function +.type __redq_tail_mont_384,@function .align 32 -__redc_tail_mont_384: +__redq_tail_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -1809,7 +1846,7 @@ __redc_tail_mont_384: .byte 0xf3,0xc3 .cfi_endproc -.size __redc_tail_mont_384,.-__redc_tail_mont_384 +.size __redq_tail_mont_384,.-__redq_tail_mont_384 .globl sgn0_pty_mont_384 .hidden sgn0_pty_mont_384 @@ -1820,6 +1857,10 @@ sgn0_pty_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sgn0_pty_mont_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1898,6 +1939,10 @@ sgn0_pty_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sgn0_pty_mont_384x$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2025,6 +2070,10 @@ mul_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_mont_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2689,6 +2738,10 @@ sqr_n_mul_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef 
__BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_n_mul_mont_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2727,7 +2780,7 @@ sqr_n_mul_mont_384: movq 0(%rsp),%rcx movq 16(%rsp),%rbx call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 movd %xmm1,%edx leaq 0(%rdi),%rsi @@ -2777,6 +2830,10 @@ sqr_n_mul_mont_383: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_n_mul_mont_383$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -3438,6 +3495,10 @@ sqr_mont_382x: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_mont_382x$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 diff --git a/crypto/blst_src/build/elf/mulx_mont_256-x86_64.s b/crypto/blst_src/build/elf/mulx_mont_256-x86_64.s index 20a02073246..42e89134cff 100644 --- a/crypto/blst_src/build/elf/mulx_mont_256-x86_64.s +++ b/crypto/blst_src/build/elf/mulx_mont_256-x86_64.s @@ -9,6 +9,7 @@ mulx_mont_sparse_256: .byte 0xf3,0x0f,0x1e,0xfa +mul_mont_sparse_256$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -71,6 +72,7 @@ sqrx_mont_sparse_256: .byte 0xf3,0x0f,0x1e,0xfa +sqr_mont_sparse_256$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -332,6 +334,7 @@ fromx_mont_256: .byte 0xf3,0x0f,0x1e,0xfa +from_mont_256$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -407,6 +410,7 @@ redcx_mont_256: .byte 0xf3,0x0f,0x1e,0xfa +redc_mont_256$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 diff --git a/crypto/blst_src/build/elf/mulx_mont_384-x86_64.s b/crypto/blst_src/build/elf/mulx_mont_384-x86_64.s index 9f9f7404ee4..5c67d918d22 100644 --- a/crypto/blst_src/build/elf/mulx_mont_384-x86_64.s +++ b/crypto/blst_src/build/elf/mulx_mont_384-x86_64.s @@ -6,9 +6,9 @@ -.type __sub_mod_384x384,@function +.type __subx_mod_384x384,@function .align 32 -__sub_mod_384x384: +__subx_mod_384x384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -73,11 +73,11 @@ __sub_mod_384x384: .byte 0xf3,0xc3 .cfi_endproc -.size __sub_mod_384x384,.-__sub_mod_384x384 +.size __subx_mod_384x384,.-__subx_mod_384x384 -.type __add_mod_384,@function +.type __addx_mod_384,@function .align 32 -__add_mod_384: +__addx_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -125,11 +125,11 @@ __add_mod_384: .byte 0xf3,0xc3 .cfi_endproc -.size __add_mod_384,.-__add_mod_384 +.size __addx_mod_384,.-__addx_mod_384 -.type __sub_mod_384,@function +.type __subx_mod_384,@function .align 32 -__sub_mod_384: +__subx_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -140,7 +140,7 @@ __sub_mod_384: movq 32(%rsi),%r12 movq 40(%rsi),%r13 -__sub_mod_384_a_is_loaded: +__subx_mod_384_a_is_loaded: subq 0(%rdx),%r8 movq 0(%rcx),%r14 sbbq 8(%rdx),%r9 @@ -177,7 +177,7 @@ __sub_mod_384_a_is_loaded: .byte 0xf3,0xc3 .cfi_endproc -.size __sub_mod_384,.-__sub_mod_384 +.size __subx_mod_384,.-__subx_mod_384 .globl mulx_mont_384x .hidden mulx_mont_384x .type mulx_mont_384x,@function @@ -187,6 +187,7 @@ mulx_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa +mul_mont_384x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -233,12 +234,12 @@ mulx_mont_384x: leaq (%rbx),%rsi leaq -48(%rbx),%rdx leaq 40+192+48(%rsp),%rdi - call __add_mod_384 + call __addx_mod_384 movq 24(%rsp),%rsi leaq 48(%rsi),%rdx leaq -48(%rdi),%rdi - call __add_mod_384 + call __addx_mod_384 leaq (%rdi),%rbx leaq 48(%rdi),%rsi @@ -248,17 +249,17 @@ mulx_mont_384x: leaq (%rdi),%rsi leaq 40(%rsp),%rdx movq 
8(%rsp),%rcx - call __sub_mod_384x384 + call __subx_mod_384x384 leaq (%rdi),%rsi leaq -96(%rdi),%rdx - call __sub_mod_384x384 + call __subx_mod_384x384 leaq 40(%rsp),%rsi leaq 40+96(%rsp),%rdx leaq 40(%rsp),%rdi - call __sub_mod_384x384 + call __subx_mod_384x384 leaq (%rcx),%rbx @@ -267,14 +268,14 @@ mulx_mont_384x: movq 0(%rsp),%rcx movq 32(%rsp),%rdi call __mulx_by_1_mont_384 - call __redc_tail_mont_384 + call __redx_tail_mont_384 leaq 40+192(%rsp),%rsi movq 0(%rsp),%rcx leaq 48(%rdi),%rdi call __mulx_by_1_mont_384 - call __redc_tail_mont_384 + call __redx_tail_mont_384 leaq 328(%rsp),%r8 movq 0(%r8),%r15 @@ -304,6 +305,7 @@ sqrx_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa +sqr_mont_384x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -335,13 +337,13 @@ sqrx_mont_384x: leaq 48(%rsi),%rdx leaq 32(%rsp),%rdi - call __add_mod_384 + call __addx_mod_384 movq 24(%rsp),%rsi leaq 48(%rsi),%rdx leaq 32+48(%rsp),%rdi - call __sub_mod_384 + call __subx_mod_384 movq 24(%rsp),%rsi @@ -439,6 +441,7 @@ mulx_382x: .byte 0xf3,0x0f,0x1e,0xfa +mul_382x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -533,18 +536,18 @@ mulx_382x: leaq 32(%rsp),%rdx movq 24(%rsp),%rcx movq %rsi,%rdi - call __sub_mod_384x384 + call __subx_mod_384x384 leaq 0(%rdi),%rsi leaq -96(%rdi),%rdx - call __sub_mod_384x384 + call __subx_mod_384x384 leaq -96(%rdi),%rsi leaq 32(%rsp),%rdx leaq -96(%rdi),%rdi - call __sub_mod_384x384 + call __subx_mod_384x384 leaq 136(%rsp),%r8 movq 0(%r8),%r15 @@ -574,6 +577,7 @@ sqrx_382x: .byte 0xf3,0x0f,0x1e,0xfa +sqr_382x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -629,7 +633,7 @@ sqrx_382x: leaq 48(%rsi),%rdx leaq 48(%rdi),%rdi - call __sub_mod_384_a_is_loaded + call __subx_mod_384_a_is_loaded leaq (%rdi),%rsi @@ -707,6 +711,7 @@ mulx_384: .byte 0xf3,0x0f,0x1e,0xfa +mul_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -933,6 +938,7 @@ sqrx_384: .byte 0xf3,0x0f,0x1e,0xfa +sqr_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1127,6 +1133,7 @@ redcx_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +redc_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1151,7 +1158,7 @@ redcx_mont_384: movq %rdx,%rbx call __mulx_by_1_mont_384 - call __redc_tail_mont_384 + call __redx_tail_mont_384 movq 8(%rsp),%r15 .cfi_restore %r15 @@ -1184,6 +1191,7 @@ fromx_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +from_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1448,9 +1456,9 @@ __mulx_by_1_mont_384: .cfi_endproc .size __mulx_by_1_mont_384,.-__mulx_by_1_mont_384 -.type __redc_tail_mont_384,@function +.type __redx_tail_mont_384,@function .align 32 -__redc_tail_mont_384: +__redx_tail_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -1495,7 +1503,7 @@ __redc_tail_mont_384: .byte 0xf3,0xc3 .cfi_endproc -.size __redc_tail_mont_384,.-__redc_tail_mont_384 +.size __redx_tail_mont_384,.-__redx_tail_mont_384 .globl sgn0x_pty_mont_384 .hidden sgn0x_pty_mont_384 @@ -1506,6 +1514,7 @@ sgn0x_pty_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +sgn0_pty_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1584,6 +1593,7 @@ sgn0x_pty_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa +sgn0_pty_mont_384x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1711,6 +1721,7 @@ mulx_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +mul_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2178,6 +2189,7 @@ sqrx_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +sqr_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ 
-2245,6 +2257,7 @@ sqrx_n_mul_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +sqr_n_mul_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2330,6 +2343,7 @@ sqrx_n_mul_mont_383: .byte 0xf3,0x0f,0x1e,0xfa +sqr_n_mul_mont_383$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2776,6 +2790,7 @@ sqrx_mont_382x: .byte 0xf3,0x0f,0x1e,0xfa +sqr_mont_382x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 diff --git a/crypto/blst_src/build/elf/sha256-armv8.S b/crypto/blst_src/build/elf/sha256-armv8.S index 7341decf4f5..45c1162c467 100644 --- a/crypto/blst_src/build/elf/sha256-armv8.S +++ b/crypto/blst_src/build/elf/sha256-armv8.S @@ -10,11 +10,12 @@ // // sha256_block procedure for ARMv8. // -// This module is stripped of scalar code paths, with raionale that all +// This module is stripped of scalar code paths, with rationale that all // known processors are NEON-capable. // // See original module at CRYPTOGAMS for further details. +.comm __blst_platform_cap,4 .text .align 6 @@ -184,6 +185,11 @@ blst_sha256_block_armv8: .type blst_sha256_block_data_order,%function .align 4 blst_sha256_block_data_order: + adrp x16,__blst_platform_cap + ldr w16,[x16,#:lo12:__blst_platform_cap] + tst w16,#1 + b.ne .Lv8_entry + stp x29, x30, [sp, #-16]! mov x29, sp sub sp,sp,#16*4 diff --git a/crypto/blst_src/build/elf/sha256-portable-x86_64.s b/crypto/blst_src/build/elf/sha256-portable-x86_64.s index 20b5c411306..2fd6a770917 100644 --- a/crypto/blst_src/build/elf/sha256-portable-x86_64.s +++ b/crypto/blst_src/build/elf/sha256-portable-x86_64.s @@ -1,3 +1,4 @@ +.comm __blst_platform_cap,4 .text .globl blst_sha256_block_data_order @@ -8,33 +9,35 @@ blst_sha256_block_data_order: .byte 0xf3,0x0f,0x1e,0xfa - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-16 pushq %rbp .cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-24 +.cfi_offset %rbp,-16 + movq %rsp,%rbp +.cfi_def_cfa_register %rbp +#ifdef __BLST_PORTABLE__ + testl $2,__blst_platform_cap(%rip) + jnz .Lblst_sha256_block_data_order$2 +#endif + pushq %rbx +.cfi_offset %rbx,-24 pushq %r12 -.cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 -.cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 -.cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 -.cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 shlq $4,%rdx subq $64+24,%rsp -.cfi_adjust_cfa_offset 16*4+3*8 + +.cfi_def_cfa %rsp,144 + leaq (%rsi,%rdx,4),%rdx movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movl 0(%rdi),%eax movl 4(%rdi),%ebx movl 8(%rdi),%ecx @@ -1636,23 +1639,23 @@ blst_sha256_block_data_order: leaq 64+24+48(%rsp),%r11 .cfi_def_cfa %r11,8 movq 64+24(%rsp),%r15 -.cfi_restore %r15 movq -40(%r11),%r14 -.cfi_restore %r14 movq -32(%r11),%r13 -.cfi_restore %r13 movq -24(%r11),%r12 + movq -16(%r11),%rbx + movq -8(%r11),%rbp .cfi_restore %r12 - movq -16(%r11),%rbp +.cfi_restore %r13 +.cfi_restore %r14 +.cfi_restore %r15 .cfi_restore %rbp - movq -8(%r11),%rbx .cfi_restore %rbx - leaq (%r11),%rsp .byte 0xf3,0xc3 .cfi_endproc .size blst_sha256_block_data_order,.-blst_sha256_block_data_order +#ifndef __BLST_PORTABLE__ .align 64 .type K256,@object K256: @@ -1744,6 +1747,7 @@ blst_sha256_hcopy: .byte 0xf3,0xc3 .cfi_endproc .size blst_sha256_hcopy,.-blst_sha256_hcopy +#endif .section .note.GNU-stack,"",@progbits .section .note.gnu.property,"a",@note diff --git a/crypto/blst_src/build/elf/sha256-x86_64.s b/crypto/blst_src/build/elf/sha256-x86_64.s index 47fdc5bc57a..940051aab16 100644 --- a/crypto/blst_src/build/elf/sha256-x86_64.s +++ 
b/crypto/blst_src/build/elf/sha256-x86_64.s @@ -1,3 +1,4 @@ +.comm __blst_platform_cap,4 .text .align 64 @@ -33,6 +34,13 @@ blst_sha256_block_data_order_shaext: .byte 0xf3,0x0f,0x1e,0xfa + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + movq %rsp,%rbp +.cfi_def_cfa_register %rbp +.Lblst_sha256_block_data_order$2: + leaq K256+128(%rip),%rcx movdqu (%rdi),%xmm1 movdqu 16(%rdi),%xmm2 @@ -234,6 +242,11 @@ blst_sha256_block_data_order_shaext: movdqu %xmm1,(%rdi) movdqu %xmm2,16(%rdi) +.cfi_def_cfa_register %rsp + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp + .byte 0xf3,0xc3 .cfi_endproc .size blst_sha256_block_data_order_shaext,.-blst_sha256_block_data_order_shaext @@ -249,30 +262,27 @@ blst_sha256_block_data_order: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + testl $2,__blst_platform_cap(%rip) + jnz .Lblst_sha256_block_data_order$2 pushq %rbx -.cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 -.cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 -.cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 -.cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 -.cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 shlq $4,%rdx - subq $40,%rsp -.cfi_adjust_cfa_offset 40 + subq $24,%rsp + leaq (%rsi,%rdx,4),%rdx - movq %rdi,0(%rsp) + movq %rdi,-64(%rbp) - movq %rdx,16(%rsp) - movq %rsp,%rbp -.cfi_def_cfa_register %rbp + movq %rdx,-48(%rbp) leaq -64(%rsp),%rsp @@ -291,7 +301,7 @@ blst_sha256_block_data_order: .align 16 .Lloop_ssse3: movdqa K256+256(%rip),%xmm7 - movq %rsi,8(%rbp) + movq %rsi,-56(%rbp) movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 @@ -1316,9 +1326,9 @@ blst_sha256_block_data_order: addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d - movq 0(%rbp),%rdi + movq -64(%rbp),%rdi movl %r14d,%eax - movq 8(%rbp),%rsi + movq -56(%rbp),%rsi addl 0(%rdi),%eax addl 4(%rdi),%ebx @@ -1330,7 +1340,7 @@ blst_sha256_block_data_order: addl 28(%rdi),%r11d leaq 64(%rsi),%rsi - cmpq 16(%rbp),%rsi + cmpq -48(%rbp),%rsi movl %eax,0(%rdi) movl %ebx,4(%rdi) @@ -1343,26 +1353,25 @@ blst_sha256_block_data_order: jb .Lloop_ssse3 xorps %xmm0,%xmm0 - leaq 40+48(%rbp),%r11 -.cfi_def_cfa %r11,8 movaps %xmm0,0(%rsp) movaps %xmm0,16(%rsp) movaps %xmm0,32(%rsp) movaps %xmm0,48(%rsp) - movq 40(%rbp),%r15 -.cfi_restore %r15 - movq -40(%r11),%r14 -.cfi_restore %r14 - movq -32(%r11),%r13 -.cfi_restore %r13 - movq -24(%r11),%r12 + movq -40(%rbp),%r15 + movq -32(%rbp),%r14 + movq -24(%rbp),%r13 + movq -16(%rbp),%r12 + movq -8(%rbp),%rbx + movq %rbp,%rsp +.cfi_def_cfa_register %rsp + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp .cfi_restore %r12 - movq -16(%r11),%rbx +.cfi_restore %r13 +.cfi_restore %r14 +.cfi_restore %r15 .cfi_restore %rbx - movq -8(%r11),%rbp -.cfi_restore %rbp - - leaq (%r11),%rsp .byte 0xf3,0xc3 .cfi_endproc .size blst_sha256_block_data_order,.-blst_sha256_block_data_order diff --git a/crypto/blst_src/build/mach-o/ct_inverse_mod_256-armv8.S b/crypto/blst_src/build/mach-o/ct_inverse_mod_256-armv8.S index f3a2c3b5f11..2fd4847a496 100644 --- a/crypto/blst_src/build/mach-o/ct_inverse_mod_256-armv8.S +++ b/crypto/blst_src/build/mach-o/ct_inverse_mod_256-armv8.S @@ -1,6 +1,7 @@ .text .globl _ct_inverse_mod_256 +.private_extern _ct_inverse_mod_256 .align 5 _ct_inverse_mod_256: @@ -60,14 +61,14 @@ _ct_inverse_mod_256: madd x4, x16, x8, xzr // |u|*|f0| madd x4, x17, x9, x4 // |v|*|g0| str x4, [x0,#8*4] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*5] stp 
x5, x5, [x0,#8*7] madd x4, x12, x8, xzr // |u|*|f1| madd x4, x13, x9, x4 // |v|*|g1| str x4, [x0,#8*9] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*10] stp x5, x5, [x0,#8*12] eor x1, x1, #256 // flip-flop src |a|b|u|v| diff --git a/crypto/blst_src/build/mach-o/ct_inverse_mod_256-x86_64.s b/crypto/blst_src/build/mach-o/ct_inverse_mod_256-x86_64.s index b6441da6e1f..bf0ad8986e7 100644 --- a/crypto/blst_src/build/mach-o/ct_inverse_mod_256-x86_64.s +++ b/crypto/blst_src/build/mach-o/ct_inverse_mod_256-x86_64.s @@ -1,6 +1,7 @@ .text .globl _ct_inverse_mod_256 +.private_extern _ct_inverse_mod_256 .p2align 5 _ct_inverse_mod_256: diff --git a/crypto/blst_src/build/mach-o/ct_inverse_mod_384-armv8.S b/crypto/blst_src/build/mach-o/ct_inverse_mod_384-armv8.S index c7d9ba8488e..b9c3acde200 100644 --- a/crypto/blst_src/build/mach-o/ct_inverse_mod_384-armv8.S +++ b/crypto/blst_src/build/mach-o/ct_inverse_mod_384-armv8.S @@ -1,6 +1,7 @@ .text .globl _ct_inverse_mod_383 +.private_extern _ct_inverse_mod_383 .align 5 _ct_inverse_mod_383: @@ -71,7 +72,7 @@ _ct_inverse_mod_383: adds x3, x3, x5 adc x4, x4, x6 stp x3, x4, [x0,#8*6] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*8] stp x5, x5, [x0,#8*10] @@ -82,7 +83,7 @@ _ct_inverse_mod_383: adds x3, x3, x5 adc x4, x4, x6 stp x3, x4, [x0,#8*12] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*14] stp x5, x5, [x0,#8*16] eor x1, x1, #256 // flip-flop src |a|b|u|v| diff --git a/crypto/blst_src/build/mach-o/ct_is_square_mod_384-armv8.S b/crypto/blst_src/build/mach-o/ct_is_square_mod_384-armv8.S index b5c953d287a..9fe0df88b59 100644 --- a/crypto/blst_src/build/mach-o/ct_is_square_mod_384-armv8.S +++ b/crypto/blst_src/build/mach-o/ct_is_square_mod_384-armv8.S @@ -1,6 +1,7 @@ .text .globl _ct_is_square_mod_384 +.private_extern _ct_is_square_mod_384 .align 5 _ct_is_square_mod_384: diff --git a/crypto/blst_src/build/mach-o/ct_is_square_mod_384-x86_64.s b/crypto/blst_src/build/mach-o/ct_is_square_mod_384-x86_64.s index f2823941167..5faadb8dbff 100644 --- a/crypto/blst_src/build/mach-o/ct_is_square_mod_384-x86_64.s +++ b/crypto/blst_src/build/mach-o/ct_is_square_mod_384-x86_64.s @@ -1,6 +1,7 @@ .text .globl _ct_is_square_mod_384 +.private_extern _ct_is_square_mod_384 .p2align 5 _ct_is_square_mod_384: diff --git a/crypto/blst_src/build/mach-o/ctq_inverse_mod_384-x86_64.s b/crypto/blst_src/build/mach-o/ctq_inverse_mod_384-x86_64.s index 185a876b87c..eebe131d0cb 100644 --- a/crypto/blst_src/build/mach-o/ctq_inverse_mod_384-x86_64.s +++ b/crypto/blst_src/build/mach-o/ctq_inverse_mod_384-x86_64.s @@ -1,6 +1,8 @@ +.comm ___blst_platform_cap,4 .text .globl _ct_inverse_mod_383 +.private_extern _ct_inverse_mod_383 .p2align 5 _ct_inverse_mod_383: @@ -8,6 +10,10 @@ _ct_inverse_mod_383: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz ct_inverse_mod_383$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 diff --git a/crypto/blst_src/build/mach-o/ctx_inverse_mod_384-x86_64.s b/crypto/blst_src/build/mach-o/ctx_inverse_mod_384-x86_64.s index 3e05df3a4b3..3f999075813 100644 --- a/crypto/blst_src/build/mach-o/ctx_inverse_mod_384-x86_64.s +++ b/crypto/blst_src/build/mach-o/ctx_inverse_mod_384-x86_64.s @@ -1,6 +1,7 @@ .text .globl _ctx_inverse_mod_383 +.private_extern _ctx_inverse_mod_383 .p2align 5 _ctx_inverse_mod_383: @@ -8,6 +9,7 @@ _ctx_inverse_mod_383: .byte 0xf3,0x0f,0x1e,0xfa 
+ct_inverse_mod_383$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -810,7 +812,7 @@ _ctx_inverse_mod_383: movq 48(%rsi),%r10 - call __inner_loop_62 + call __tail_loop_53 @@ -1523,7 +1525,7 @@ L$oop_31: .p2align 5 -__inner_loop_62: +__tail_loop_53: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -1532,7 +1534,7 @@ __inner_loop_62: xorq %r12,%r12 movq $1,%r13 -L$oop_62: +L$oop_53: xorq %rax,%rax testq $1,%r8 movq %r10,%rbx @@ -1559,7 +1561,7 @@ L$oop_62: subq %rax,%rdx subq %rbx,%rcx subl $1,%edi - jnz L$oop_62 + jnz L$oop_53 .byte 0xf3,0xc3 .cfi_endproc diff --git a/crypto/blst_src/build/mach-o/div3w-armv8.S b/crypto/blst_src/build/mach-o/div3w-armv8.S index 5a5eb3a01d7..4b130080123 100644 --- a/crypto/blst_src/build/mach-o/div3w-armv8.S +++ b/crypto/blst_src/build/mach-o/div3w-armv8.S @@ -25,7 +25,7 @@ Loop: asr x3,x0,#63 // top bit -> mask add x0,x0,x0 // Q <<= 1 subs x6,x4,x1 // R - D - add x0,x0,#1 // Q + specilative bit + add x0,x0,#1 // Q + speculative bit sbcs x7,x5,x2 sbc x0,x0,xzr // subtract speculative bit diff --git a/crypto/blst_src/build/mach-o/div3w-x86_64.s b/crypto/blst_src/build/mach-o/div3w-x86_64.s index 8075571c87d..99a94d50a2b 100644 --- a/crypto/blst_src/build/mach-o/div3w-x86_64.s +++ b/crypto/blst_src/build/mach-o/div3w-x86_64.s @@ -8,6 +8,8 @@ _div_3_limbs: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa + + movq (%rdi),%r8 movq 8(%rdi),%r9 xorq %rax,%rax @@ -39,8 +41,9 @@ L$oop: orq %rcx,%rax + .byte 0xf3,0xc3 -.cfi_endproc +.cfi_endproc .globl _quot_rem_128 .private_extern _quot_rem_128 @@ -50,6 +53,8 @@ _quot_rem_128: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rax movq %rdx,%rcx @@ -84,8 +89,9 @@ _quot_rem_128: movq %rcx,%rax + .byte 0xf3,0xc3 -.cfi_endproc +.cfi_endproc @@ -100,6 +106,8 @@ _quot_rem_64: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rax imulq 0(%rsi),%rdx @@ -110,6 +118,7 @@ _quot_rem_64: movq %r10,0(%rdi) movq %rax,8(%rdi) + .byte 0xf3,0xc3 -.cfi_endproc +.cfi_endproc diff --git a/crypto/blst_src/build/mach-o/mulq_mont_256-x86_64.s b/crypto/blst_src/build/mach-o/mulq_mont_256-x86_64.s index d83f5440342..842c39225b6 100644 --- a/crypto/blst_src/build/mach-o/mulq_mont_256-x86_64.s +++ b/crypto/blst_src/build/mach-o/mulq_mont_256-x86_64.s @@ -1,3 +1,4 @@ +.comm ___blst_platform_cap,4 .text .globl _mul_mont_sparse_256 @@ -9,6 +10,10 @@ _mul_mont_sparse_256: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz mul_mont_sparse_256$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -73,6 +78,10 @@ _sqr_mont_sparse_256: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sqr_mont_sparse_256$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -419,6 +428,10 @@ _from_mont_256: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz from_mont_256$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -494,6 +507,10 @@ _redc_mont_256: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz redc_mont_256$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 diff --git a/crypto/blst_src/build/mach-o/mulq_mont_384-x86_64.s b/crypto/blst_src/build/mach-o/mulq_mont_384-x86_64.s index 0d8ac89cfc2..7052343d0ac 100644 --- a/crypto/blst_src/build/mach-o/mulq_mont_384-x86_64.s +++ b/crypto/blst_src/build/mach-o/mulq_mont_384-x86_64.s @@ -1,3 +1,4 @@ +.comm ___blst_platform_cap,4 .text @@ -8,7 +9,7 @@ .p2align 5 
-__sub_mod_384x384: +__subq_mod_384x384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -77,7 +78,7 @@ __sub_mod_384x384: .p2align 5 -__add_mod_384: +__addq_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -129,7 +130,7 @@ __add_mod_384: .p2align 5 -__sub_mod_384: +__subq_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -140,7 +141,7 @@ __sub_mod_384: movq 32(%rsi),%r12 movq 40(%rsi),%r13 -__sub_mod_384_a_is_loaded: +__subq_mod_384_a_is_loaded: subq 0(%rdx),%r8 movq 0(%rcx),%r14 sbbq 8(%rdx),%r9 @@ -187,6 +188,10 @@ _mul_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz mul_mont_384x$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -232,12 +237,12 @@ _mul_mont_384x: movq 8(%rsp),%rcx leaq -48(%rsi),%rdx leaq 40+192+48(%rsp),%rdi - call __add_mod_384 + call __addq_mod_384 movq 16(%rsp),%rsi leaq 48(%rsi),%rdx leaq -48(%rdi),%rdi - call __add_mod_384 + call __addq_mod_384 leaq (%rdi),%rbx leaq 48(%rdi),%rsi @@ -247,17 +252,17 @@ _mul_mont_384x: leaq (%rdi),%rsi leaq 40(%rsp),%rdx movq 8(%rsp),%rcx - call __sub_mod_384x384 + call __subq_mod_384x384 leaq (%rdi),%rsi leaq -96(%rdi),%rdx - call __sub_mod_384x384 + call __subq_mod_384x384 leaq 40(%rsp),%rsi leaq 40+96(%rsp),%rdx leaq 40(%rsp),%rdi - call __sub_mod_384x384 + call __subq_mod_384x384 movq %rcx,%rbx @@ -266,14 +271,14 @@ _mul_mont_384x: movq 0(%rsp),%rcx movq 32(%rsp),%rdi call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 leaq 40+192(%rsp),%rsi movq 0(%rsp),%rcx leaq 48(%rdi),%rdi call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 leaq 328(%rsp),%r8 movq 0(%r8),%r15 @@ -303,6 +308,10 @@ _sqr_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sqr_mont_384x$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -333,13 +342,13 @@ _sqr_mont_384x: leaq 48(%rsi),%rdx leaq 32(%rsp),%rdi - call __add_mod_384 + call __addq_mod_384 movq 16(%rsp),%rsi leaq 48(%rsi),%rdx leaq 32+48(%rsp),%rdi - call __sub_mod_384 + call __subq_mod_384 movq 16(%rsp),%rsi @@ -427,6 +436,10 @@ _mul_382x: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz mul_382x$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -521,18 +534,18 @@ _mul_382x: leaq 32(%rsp),%rdx movq 24(%rsp),%rcx movq %rsi,%rdi - call __sub_mod_384x384 + call __subq_mod_384x384 leaq 0(%rdi),%rsi leaq -96(%rdi),%rdx - call __sub_mod_384x384 + call __subq_mod_384x384 leaq -96(%rdi),%rsi leaq 32(%rsp),%rdx leaq -96(%rdi),%rdi - call __sub_mod_384x384 + call __subq_mod_384x384 leaq 136(%rsp),%r8 movq 0(%r8),%r15 @@ -562,6 +575,10 @@ _sqr_382x: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sqr_382x$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -617,7 +634,7 @@ _sqr_382x: leaq 48(%rsi),%rdx leaq 48(%rdi),%rdi - call __sub_mod_384_a_is_loaded + call __subq_mod_384_a_is_loaded leaq (%rdi),%rsi @@ -695,6 +712,10 @@ _mul_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz mul_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1019,6 +1040,10 @@ _sqr_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sqr_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1265,6 +1290,10 @@ _sqr_mont_384: .byte 0xf3,0x0f,0x1e,0xfa 
+#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sqr_mont_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1299,7 +1328,7 @@ _sqr_mont_384: movq 104(%rsp),%rbx movq 112(%rsp),%rdi call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 leaq 120(%rsp),%r8 movq 120(%rsp),%r15 @@ -1332,6 +1361,10 @@ _redc_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz redc_mont_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1356,7 +1389,7 @@ _redc_mont_384: movq %rdx,%rbx call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 movq 8(%rsp),%r15 .cfi_restore %r15 @@ -1389,6 +1422,10 @@ _from_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz from_mont_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1764,7 +1801,7 @@ __mulq_by_1_mont_384: .p2align 5 -__redc_tail_mont_384: +__redq_tail_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -1820,6 +1857,10 @@ _sgn0_pty_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sgn0_pty_mont_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1898,6 +1939,10 @@ _sgn0_pty_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sgn0_pty_mont_384x$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2025,6 +2070,10 @@ _mul_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz mul_mont_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2689,6 +2738,10 @@ _sqr_n_mul_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sqr_n_mul_mont_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2727,7 +2780,7 @@ L$oop_sqr_384: movq 0(%rsp),%rcx movq 16(%rsp),%rbx call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 movd %xmm1,%edx leaq 0(%rdi),%rsi @@ -2777,6 +2830,10 @@ _sqr_n_mul_mont_383: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sqr_n_mul_mont_383$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -3438,6 +3495,10 @@ _sqr_mont_382x: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sqr_mont_382x$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 diff --git a/crypto/blst_src/build/mach-o/mulx_mont_256-x86_64.s b/crypto/blst_src/build/mach-o/mulx_mont_256-x86_64.s index 178372f41b2..ae9a76b739c 100644 --- a/crypto/blst_src/build/mach-o/mulx_mont_256-x86_64.s +++ b/crypto/blst_src/build/mach-o/mulx_mont_256-x86_64.s @@ -9,6 +9,7 @@ _mulx_mont_sparse_256: .byte 0xf3,0x0f,0x1e,0xfa +mul_mont_sparse_256$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -71,6 +72,7 @@ _sqrx_mont_sparse_256: .byte 0xf3,0x0f,0x1e,0xfa +sqr_mont_sparse_256$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -332,6 +334,7 @@ _fromx_mont_256: .byte 0xf3,0x0f,0x1e,0xfa +from_mont_256$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -407,6 +410,7 @@ _redcx_mont_256: .byte 0xf3,0x0f,0x1e,0xfa +redc_mont_256$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 diff --git a/crypto/blst_src/build/mach-o/mulx_mont_384-x86_64.s 
b/crypto/blst_src/build/mach-o/mulx_mont_384-x86_64.s index 95d3dadcc67..c5afeec8a51 100644 --- a/crypto/blst_src/build/mach-o/mulx_mont_384-x86_64.s +++ b/crypto/blst_src/build/mach-o/mulx_mont_384-x86_64.s @@ -8,7 +8,7 @@ .p2align 5 -__sub_mod_384x384: +__subx_mod_384x384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -77,7 +77,7 @@ __sub_mod_384x384: .p2align 5 -__add_mod_384: +__addx_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -129,7 +129,7 @@ __add_mod_384: .p2align 5 -__sub_mod_384: +__subx_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -140,7 +140,7 @@ __sub_mod_384: movq 32(%rsi),%r12 movq 40(%rsi),%r13 -__sub_mod_384_a_is_loaded: +__subx_mod_384_a_is_loaded: subq 0(%rdx),%r8 movq 0(%rcx),%r14 sbbq 8(%rdx),%r9 @@ -187,6 +187,7 @@ _mulx_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa +mul_mont_384x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -233,12 +234,12 @@ _mulx_mont_384x: leaq (%rbx),%rsi leaq -48(%rbx),%rdx leaq 40+192+48(%rsp),%rdi - call __add_mod_384 + call __addx_mod_384 movq 24(%rsp),%rsi leaq 48(%rsi),%rdx leaq -48(%rdi),%rdi - call __add_mod_384 + call __addx_mod_384 leaq (%rdi),%rbx leaq 48(%rdi),%rsi @@ -248,17 +249,17 @@ _mulx_mont_384x: leaq (%rdi),%rsi leaq 40(%rsp),%rdx movq 8(%rsp),%rcx - call __sub_mod_384x384 + call __subx_mod_384x384 leaq (%rdi),%rsi leaq -96(%rdi),%rdx - call __sub_mod_384x384 + call __subx_mod_384x384 leaq 40(%rsp),%rsi leaq 40+96(%rsp),%rdx leaq 40(%rsp),%rdi - call __sub_mod_384x384 + call __subx_mod_384x384 leaq (%rcx),%rbx @@ -267,14 +268,14 @@ _mulx_mont_384x: movq 0(%rsp),%rcx movq 32(%rsp),%rdi call __mulx_by_1_mont_384 - call __redc_tail_mont_384 + call __redx_tail_mont_384 leaq 40+192(%rsp),%rsi movq 0(%rsp),%rcx leaq 48(%rdi),%rdi call __mulx_by_1_mont_384 - call __redc_tail_mont_384 + call __redx_tail_mont_384 leaq 328(%rsp),%r8 movq 0(%r8),%r15 @@ -304,6 +305,7 @@ _sqrx_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa +sqr_mont_384x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -335,13 +337,13 @@ _sqrx_mont_384x: leaq 48(%rsi),%rdx leaq 32(%rsp),%rdi - call __add_mod_384 + call __addx_mod_384 movq 24(%rsp),%rsi leaq 48(%rsi),%rdx leaq 32+48(%rsp),%rdi - call __sub_mod_384 + call __subx_mod_384 movq 24(%rsp),%rsi @@ -439,6 +441,7 @@ _mulx_382x: .byte 0xf3,0x0f,0x1e,0xfa +mul_382x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -533,18 +536,18 @@ _mulx_382x: leaq 32(%rsp),%rdx movq 24(%rsp),%rcx movq %rsi,%rdi - call __sub_mod_384x384 + call __subx_mod_384x384 leaq 0(%rdi),%rsi leaq -96(%rdi),%rdx - call __sub_mod_384x384 + call __subx_mod_384x384 leaq -96(%rdi),%rsi leaq 32(%rsp),%rdx leaq -96(%rdi),%rdi - call __sub_mod_384x384 + call __subx_mod_384x384 leaq 136(%rsp),%r8 movq 0(%r8),%r15 @@ -574,6 +577,7 @@ _sqrx_382x: .byte 0xf3,0x0f,0x1e,0xfa +sqr_382x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -629,7 +633,7 @@ _sqrx_382x: leaq 48(%rsi),%rdx leaq 48(%rdi),%rdi - call __sub_mod_384_a_is_loaded + call __subx_mod_384_a_is_loaded leaq (%rdi),%rsi @@ -707,6 +711,7 @@ _mulx_384: .byte 0xf3,0x0f,0x1e,0xfa +mul_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -933,6 +938,7 @@ _sqrx_384: .byte 0xf3,0x0f,0x1e,0xfa +sqr_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1127,6 +1133,7 @@ _redcx_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +redc_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1151,7 +1158,7 @@ _redcx_mont_384: movq %rdx,%rbx call __mulx_by_1_mont_384 - call __redc_tail_mont_384 + call __redx_tail_mont_384 movq 
8(%rsp),%r15 .cfi_restore %r15 @@ -1184,6 +1191,7 @@ _fromx_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +from_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1450,7 +1458,7 @@ __mulx_by_1_mont_384: .p2align 5 -__redc_tail_mont_384: +__redx_tail_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -1506,6 +1514,7 @@ _sgn0x_pty_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +sgn0_pty_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1584,6 +1593,7 @@ _sgn0x_pty_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa +sgn0_pty_mont_384x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1711,6 +1721,7 @@ _mulx_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +mul_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2178,6 +2189,7 @@ _sqrx_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +sqr_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2245,6 +2257,7 @@ _sqrx_n_mul_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +sqr_n_mul_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2330,6 +2343,7 @@ _sqrx_n_mul_mont_383: .byte 0xf3,0x0f,0x1e,0xfa +sqr_n_mul_mont_383$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2776,6 +2790,7 @@ _sqrx_mont_382x: .byte 0xf3,0x0f,0x1e,0xfa +sqr_mont_382x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 diff --git a/crypto/blst_src/build/mach-o/sha256-armv8.S b/crypto/blst_src/build/mach-o/sha256-armv8.S index c928f75025f..3f3c1266dcd 100644 --- a/crypto/blst_src/build/mach-o/sha256-armv8.S +++ b/crypto/blst_src/build/mach-o/sha256-armv8.S @@ -10,11 +10,12 @@ // // sha256_block procedure for ARMv8. // -// This module is stripped of scalar code paths, with raionale that all +// This module is stripped of scalar code paths, with rationale that all // known processors are NEON-capable. // // See original module at CRYPTOGAMS for further details. +.comm ___blst_platform_cap,4 .text .align 6 @@ -184,6 +185,11 @@ Loop_hw: .align 4 _blst_sha256_block_data_order: + adrp x16,___blst_platform_cap@PAGE + ldr w16,[x16,___blst_platform_cap@PAGEOFF] + tst w16,#1 + b.ne Lv8_entry + stp x29, x30, [sp, #-16]! 
mov x29, sp sub sp,sp,#16*4 diff --git a/crypto/blst_src/build/mach-o/sha256-portable-x86_64.s b/crypto/blst_src/build/mach-o/sha256-portable-x86_64.s index 3f000720d00..9f0a4f84ff0 100644 --- a/crypto/blst_src/build/mach-o/sha256-portable-x86_64.s +++ b/crypto/blst_src/build/mach-o/sha256-portable-x86_64.s @@ -1,3 +1,4 @@ +.comm ___blst_platform_cap,4 .text .globl _blst_sha256_block_data_order @@ -8,33 +9,35 @@ _blst_sha256_block_data_order: .byte 0xf3,0x0f,0x1e,0xfa - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-16 pushq %rbp .cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-24 +.cfi_offset %rbp,-16 + movq %rsp,%rbp +.cfi_def_cfa_register %rbp +#ifdef __BLST_PORTABLE__ + testl $2,___blst_platform_cap(%rip) + jnz L$blst_sha256_block_data_order$2 +#endif + pushq %rbx +.cfi_offset %rbx,-24 pushq %r12 -.cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 -.cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 -.cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 -.cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 shlq $4,%rdx subq $64+24,%rsp -.cfi_adjust_cfa_offset 16*4+3*8 + +.cfi_def_cfa %rsp,144 + leaq (%rsi,%rdx,4),%rdx movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movl 0(%rdi),%eax movl 4(%rdi),%ebx movl 8(%rdi),%ecx @@ -1636,23 +1639,23 @@ L$rounds_16_xx: leaq 64+24+48(%rsp),%r11 .cfi_def_cfa %r11,8 movq 64+24(%rsp),%r15 -.cfi_restore %r15 movq -40(%r11),%r14 -.cfi_restore %r14 movq -32(%r11),%r13 -.cfi_restore %r13 movq -24(%r11),%r12 + movq -16(%r11),%rbx + movq -8(%r11),%rbp .cfi_restore %r12 - movq -16(%r11),%rbp +.cfi_restore %r13 +.cfi_restore %r14 +.cfi_restore %r15 .cfi_restore %rbp - movq -8(%r11),%rbx .cfi_restore %rbx - leaq (%r11),%rsp .byte 0xf3,0xc3 .cfi_endproc +#ifndef __BLST_PORTABLE__ .p2align 6 K256: @@ -1744,3 +1747,4 @@ _blst_sha256_hcopy: .byte 0xf3,0xc3 .cfi_endproc +#endif diff --git a/crypto/blst_src/build/mach-o/sha256-x86_64.s b/crypto/blst_src/build/mach-o/sha256-x86_64.s index dee75e35362..cff024eed4f 100644 --- a/crypto/blst_src/build/mach-o/sha256-x86_64.s +++ b/crypto/blst_src/build/mach-o/sha256-x86_64.s @@ -1,3 +1,4 @@ +.comm ___blst_platform_cap,4 .text .p2align 6 @@ -33,6 +34,13 @@ _blst_sha256_block_data_order_shaext: .byte 0xf3,0x0f,0x1e,0xfa + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + movq %rsp,%rbp +.cfi_def_cfa_register %rbp +L$blst_sha256_block_data_order$2: + leaq K256+128(%rip),%rcx movdqu (%rdi),%xmm1 movdqu 16(%rdi),%xmm2 @@ -234,6 +242,11 @@ L$oop_shaext: movdqu %xmm1,(%rdi) movdqu %xmm2,16(%rdi) +.cfi_def_cfa_register %rsp + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp + .byte 0xf3,0xc3 .cfi_endproc @@ -249,30 +262,27 @@ _blst_sha256_block_data_order: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + testl $2,___blst_platform_cap(%rip) + jnz L$blst_sha256_block_data_order$2 pushq %rbx -.cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 -.cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 -.cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 -.cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 -.cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 shlq $4,%rdx - subq $40,%rsp -.cfi_adjust_cfa_offset 40 + subq $24,%rsp + leaq (%rsi,%rdx,4),%rdx - movq %rdi,0(%rsp) + movq %rdi,-64(%rbp) - movq %rdx,16(%rsp) - movq %rsp,%rbp -.cfi_def_cfa_register %rbp + movq %rdx,-48(%rbp) leaq -64(%rsp),%rsp @@ -291,7 +301,7 @@ _blst_sha256_block_data_order: .p2align 4 L$loop_ssse3: movdqa K256+256(%rip),%xmm7 - movq %rsi,8(%rbp) + 
movq %rsi,-56(%rbp) movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 @@ -1316,9 +1326,9 @@ L$ssse3_00_47: addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d - movq 0(%rbp),%rdi + movq -64(%rbp),%rdi movl %r14d,%eax - movq 8(%rbp),%rsi + movq -56(%rbp),%rsi addl 0(%rdi),%eax addl 4(%rdi),%ebx @@ -1330,7 +1340,7 @@ L$ssse3_00_47: addl 28(%rdi),%r11d leaq 64(%rsi),%rsi - cmpq 16(%rbp),%rsi + cmpq -48(%rbp),%rsi movl %eax,0(%rdi) movl %ebx,4(%rdi) @@ -1343,26 +1353,25 @@ L$ssse3_00_47: jb L$loop_ssse3 xorps %xmm0,%xmm0 - leaq 40+48(%rbp),%r11 -.cfi_def_cfa %r11,8 movaps %xmm0,0(%rsp) movaps %xmm0,16(%rsp) movaps %xmm0,32(%rsp) movaps %xmm0,48(%rsp) - movq 40(%rbp),%r15 -.cfi_restore %r15 - movq -40(%r11),%r14 -.cfi_restore %r14 - movq -32(%r11),%r13 -.cfi_restore %r13 - movq -24(%r11),%r12 + movq -40(%rbp),%r15 + movq -32(%rbp),%r14 + movq -24(%rbp),%r13 + movq -16(%rbp),%r12 + movq -8(%rbp),%rbx + movq %rbp,%rsp +.cfi_def_cfa_register %rsp + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp .cfi_restore %r12 - movq -16(%r11),%rbx +.cfi_restore %r13 +.cfi_restore %r14 +.cfi_restore %r15 .cfi_restore %rbx - movq -8(%r11),%rbp -.cfi_restore %rbp - - leaq (%r11),%rsp .byte 0xf3,0xc3 .cfi_endproc diff --git a/crypto/blst_src/build/refresh.sh b/crypto/blst_src/build/refresh.sh new file mode 100755 index 00000000000..56b0b279c69 --- /dev/null +++ b/crypto/blst_src/build/refresh.sh @@ -0,0 +1,48 @@ +#!/bin/sh + +HERE=`dirname $0` +cd "${HERE}" + +PERL=${PERL:-perl} + +for pl in ../src/asm/*-x86_64.pl; do + s=`basename $pl .pl`.asm + expr $s : '.*portable' > /dev/null || (set -x; ${PERL} $pl masm > win64/$s) + s=`basename $pl .pl`.s + (set -x; ${PERL} $pl elf > elf/$s) + (set -x; ${PERL} $pl mingw64 > coff/$s) + (set -x; ${PERL} $pl macosx > mach-o/$s) +done + +for pl in ../src/asm/*-armv8.pl; do + s=`basename $pl .pl`.asm + (set -x; ${PERL} $pl win64 > win64/$s) + s=`basename $pl .pl`.S + (set -x; ${PERL} $pl linux64 > elf/$s) + (set -x; ${PERL} $pl coff64 > coff/$s) + (set -x; ${PERL} $pl ios64 > mach-o/$s) +done + +( cd ../bindings; + echo "LIBRARY blst" + echo + echo "EXPORTS" + cc -E blst.h | \ + ${PERL} -ne '{ (/(blst_[\w]+)\s*\(/ || /(BLS12_[\w]+);/) && print "\t$1\n" }' + echo +) > win64/blst.def + +if which bindgen > /dev/null 2>&1; then + ( cd ../bindings; set -x; + bindgen --opaque-type blst_pairing \ + --opaque-type blst_uniq \ + --with-derive-default \ + --with-derive-eq \ + --rustified-enum BLST.\* \ + blst.h -- -D__BLST_RUST_BINDGEN__ \ + | ${PERL} ../build/bindings_trim.pl > rust/src/bindings.rs + ) +else + echo "Install Rust bindgen with 'cargo install bindgen-cli'" 1>&2 + exit 1 +fi diff --git a/crypto/blst_src/build/win64/add_mod_256-x86_64.asm b/crypto/blst_src/build/win64/add_mod_256-x86_64.asm index 09a5c17975d..d5308b8f809 100644 --- a/crypto/blst_src/build/win64/add_mod_256-x86_64.asm +++ b/crypto/blst_src/build/win64/add_mod_256-x86_64.asm @@ -11,15 +11,14 @@ add_mod_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_add_mod_256:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx sub rsp,8 @@ -84,14 +83,13 @@ mul_by_3_mod_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_by_3_mod_256:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 push rbx push r12 @@ -130,6 +128,7 @@ mul_by_3_mod_256 ENDP ALIGN 32 __lshift_mod_256 PROC PRIVATE DB 243,15,30,250 + add r8,r8 adc r9,r9 mov rax,r8 @@ -165,15 +164,14 @@ 
lshift_mod_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_lshift_mod_256:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx push r12 @@ -224,15 +222,14 @@ rshift_mod_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_rshift_mod_256:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx sub rsp,8 @@ -315,15 +312,14 @@ cneg_mod_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_cneg_mod_256:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx push r12 @@ -395,15 +391,14 @@ sub_mod_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sub_mod_256:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx sub rsp,8 @@ -466,11 +461,10 @@ check_mod_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_check_mod_256:: - mov rdi,rcx - mov rsi,rdx - + mov rdi,rcx + mov rsi,rdx mov rax,QWORD PTR[rdi] mov r9,QWORD PTR[8+rdi] mov r10,QWORD PTR[16+rdi] @@ -511,15 +505,14 @@ add_n_check_mod_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_add_n_check_mod_256:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx sub rsp,8 @@ -589,15 +582,14 @@ sub_n_check_mod_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sub_n_check_mod_256:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx sub rsp,8 @@ -764,8 +756,9 @@ $L$SEH_info_add_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_add_mod_256_body:: DB 1,0,9,0 DB 000h,034h,001h,000h @@ -773,7 +766,8 @@ DB 000h,054h,002h,000h DB 000h,074h,004h,000h DB 000h,064h,005h,000h DB 000h,022h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_add_mod_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -784,8 +778,9 @@ $L$SEH_info_mul_by_3_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mul_by_3_mod_256_body:: DB 1,0,11,0 DB 000h,0c4h,000h,000h @@ -805,8 +800,9 @@ $L$SEH_info_lshift_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_lshift_mod_256_body:: DB 1,0,11,0 DB 000h,0c4h,000h,000h @@ -826,8 +822,9 @@ $L$SEH_info_rshift_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_rshift_mod_256_body:: DB 1,0,9,0 DB 000h,034h,001h,000h @@ -835,7 +832,8 @@ DB 000h,054h,002h,000h DB 000h,074h,004h,000h DB 000h,064h,005h,000h DB 000h,022h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_rshift_mod_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -846,8 +844,9 @@ $L$SEH_info_cneg_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_cneg_mod_256_body:: DB 1,0,11,0 DB 000h,0c4h,000h,000h @@ -867,8 +866,9 @@ $L$SEH_info_sub_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sub_mod_256_body:: DB 1,0,9,0 DB 000h,034h,001h,000h @@ -876,7 +876,8 @@ DB 000h,054h,002h,000h DB 000h,074h,004h,000h DB 
000h,064h,005h,000h DB 000h,022h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sub_mod_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -893,8 +894,9 @@ $L$SEH_info_add_n_check_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_add_n_check_mod_256_body:: DB 1,0,9,0 DB 000h,034h,001h,000h @@ -902,7 +904,8 @@ DB 000h,054h,002h,000h DB 000h,074h,004h,000h DB 000h,064h,005h,000h DB 000h,022h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_add_n_check_mod_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -913,8 +916,9 @@ $L$SEH_info_sub_n_check_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sub_n_check_mod_256_body:: DB 1,0,9,0 DB 000h,034h,001h,000h @@ -922,7 +926,8 @@ DB 000h,054h,002h,000h DB 000h,074h,004h,000h DB 000h,064h,005h,000h DB 000h,022h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sub_n_check_mod_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h diff --git a/crypto/blst_src/build/win64/add_mod_384-x86_64.asm b/crypto/blst_src/build/win64/add_mod_384-x86_64.asm index 8a7b9e255db..560e02ee105 100644 --- a/crypto/blst_src/build/win64/add_mod_384-x86_64.asm +++ b/crypto/blst_src/build/win64/add_mod_384-x86_64.asm @@ -11,15 +11,14 @@ add_mod_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_add_mod_384:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx push r12 @@ -64,6 +63,7 @@ add_mod_384 ENDP ALIGN 32 __add_mod_384 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -120,15 +120,14 @@ add_mod_384x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_add_mod_384x:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx push r12 @@ -190,15 +189,14 @@ rshift_mod_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_rshift_mod_384:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx push r12 @@ -260,6 +258,7 @@ rshift_mod_384 ENDP ALIGN 32 __rshift_mod_384 PROC PRIVATE DB 243,15,30,250 + mov rsi,1 mov r14,QWORD PTR[rcx] and rsi,r8 @@ -320,14 +319,13 @@ div_by_2_mod_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_div_by_2_mod_384:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 push rbx push r12 @@ -394,15 +392,14 @@ lshift_mod_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_lshift_mod_384:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx push r12 @@ -493,6 +490,7 @@ lshift_mod_384 ENDP ALIGN 32 __lshift_mod_384 PROC PRIVATE DB 243,15,30,250 + add r8,r8 adc r9,r9 adc r10,r10 @@ -536,14 +534,13 @@ mul_by_3_mod_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_by_3_mod_384:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 push rbx push r12 @@ -605,14 +602,13 @@ mul_by_8_mod_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_by_8_mod_384:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 push rbx push r12 @@ -681,14 +677,13 @@ mul_by_3_mod_384x 
PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_by_3_mod_384x:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 push rbx push r12 @@ -766,14 +761,13 @@ mul_by_8_mod_384x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_by_8_mod_384x:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 push rbx push r12 @@ -861,15 +855,14 @@ cneg_mod_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_cneg_mod_384:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx push r12 @@ -970,15 +963,14 @@ sub_mod_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sub_mod_384:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx push r12 @@ -1023,6 +1015,7 @@ sub_mod_384 ENDP ALIGN 32 __sub_mod_384 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -1077,15 +1070,14 @@ sub_mod_384x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sub_mod_384x:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx push r12 @@ -1145,14 +1137,13 @@ mul_by_1_plus_i_mod_384x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_by_1_plus_i_mod_384x:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 push rbx push r12 @@ -1297,11 +1288,10 @@ sgn0_pty_mod_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sgn0_pty_mod_384:: - mov rdi,rcx - mov rsi,rdx - + mov rdi,rcx + mov rsi,rdx $L$SEH_body_sgn0_pty_mod_384:: mov r8,QWORD PTR[rdi] @@ -1353,13 +1343,12 @@ sgn0_pty_mod_384x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sgn0_pty_mod_384x:: - mov rdi,rcx - mov rsi,rdx - push rbp + mov rdi,rcx + mov rsi,rdx push rbx sub rsp,8 @@ -1472,6 +1461,7 @@ PUBLIC vec_select_32 ALIGN 32 vec_select_32 PROC PUBLIC DB 243,15,30,250 + movd xmm5,r9d pxor xmm4,xmm4 pshufd xmm5,xmm5,0 @@ -1500,6 +1490,7 @@ PUBLIC vec_select_48 ALIGN 32 vec_select_48 PROC PUBLIC DB 243,15,30,250 + movd xmm5,r9d pxor xmm4,xmm4 pshufd xmm5,xmm5,0 @@ -1534,6 +1525,7 @@ PUBLIC vec_select_96 ALIGN 32 vec_select_96 PROC PUBLIC DB 243,15,30,250 + movd xmm5,r9d pxor xmm4,xmm4 pshufd xmm5,xmm5,0 @@ -1586,6 +1578,7 @@ PUBLIC vec_select_192 ALIGN 32 vec_select_192 PROC PUBLIC DB 243,15,30,250 + movd xmm5,r9d pxor xmm4,xmm4 pshufd xmm5,xmm5,0 @@ -1674,6 +1667,7 @@ PUBLIC vec_select_144 ALIGN 32 vec_select_144 PROC PUBLIC DB 243,15,30,250 + movd xmm5,r9d pxor xmm4,xmm4 pshufd xmm5,xmm5,0 @@ -1744,6 +1738,7 @@ PUBLIC vec_select_288 ALIGN 32 vec_select_288 PROC PUBLIC DB 243,15,30,250 + movd xmm5,r9d pxor xmm4,xmm4 pshufd xmm5,xmm5,0 @@ -1868,6 +1863,7 @@ PUBLIC vec_prefetch ALIGN 32 vec_prefetch PROC PUBLIC DB 243,15,30,250 + lea rdx,QWORD PTR[((-1))+rdx*1+rcx] mov rax,64 xor r8,r8 @@ -1909,6 +1905,7 @@ PUBLIC vec_is_zero_16x ALIGN 32 vec_is_zero_16x PROC PUBLIC DB 243,15,30,250 + shr edx,4 movdqu xmm0,XMMWORD PTR[rcx] lea rcx,QWORD PTR[16+rcx] @@ -1937,6 +1934,7 @@ PUBLIC vec_is_equal_16x ALIGN 32 vec_is_equal_16x PROC PUBLIC DB 243,15,30,250 + shr r8d,4 movdqu xmm0,XMMWORD PTR[rcx] movdqu xmm1,XMMWORD PTR[rdx] @@ -2154,8 +2152,9 @@ $L$SEH_info_add_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 
$L$SEH_info_add_mod_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -2167,7 +2166,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_add_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2178,8 +2178,9 @@ $L$SEH_info_add_mod_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_add_mod_384x_body:: DB 1,0,17,0 DB 000h,0f4h,003h,000h @@ -2191,7 +2192,8 @@ DB 000h,054h,008h,000h DB 000h,074h,00ah,000h DB 000h,064h,00bh,000h DB 000h,082h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_add_mod_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2202,8 +2204,9 @@ $L$SEH_info_rshift_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_rshift_mod_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -2215,7 +2218,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_rshift_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2226,8 +2230,9 @@ $L$SEH_info_div_by_2_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_div_by_2_mod_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -2239,7 +2244,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_div_by_2_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2250,8 +2256,9 @@ $L$SEH_info_lshift_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_lshift_mod_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -2263,7 +2270,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_lshift_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2274,8 +2282,9 @@ $L$SEH_info_mul_by_3_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mul_by_3_mod_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -2287,7 +2296,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mul_by_3_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2298,8 +2308,9 @@ $L$SEH_info_mul_by_8_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mul_by_8_mod_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -2311,7 +2322,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mul_by_8_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2322,8 +2334,9 @@ $L$SEH_info_mul_by_3_mod_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mul_by_3_mod_384x_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -2335,7 +2348,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mul_by_3_mod_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2346,8 +2360,9 @@ 
$L$SEH_info_mul_by_8_mod_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mul_by_8_mod_384x_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -2359,7 +2374,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mul_by_8_mod_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2370,8 +2386,9 @@ $L$SEH_info_cneg_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_cneg_mod_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -2383,7 +2400,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_cneg_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2394,8 +2412,9 @@ $L$SEH_info_sub_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sub_mod_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -2407,7 +2426,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sub_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2418,8 +2438,9 @@ $L$SEH_info_sub_mod_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sub_mod_384x_body:: DB 1,0,17,0 DB 000h,0f4h,003h,000h @@ -2431,7 +2452,8 @@ DB 000h,054h,008h,000h DB 000h,074h,00ah,000h DB 000h,064h,00bh,000h DB 000h,082h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sub_mod_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2442,8 +2464,9 @@ $L$SEH_info_mul_by_1_plus_i_mod_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mul_by_1_plus_i_mod_384x_body:: DB 1,0,17,0 DB 000h,0f4h,007h,000h @@ -2455,7 +2478,8 @@ DB 000h,054h,00ch,000h DB 000h,074h,00eh,000h DB 000h,064h,00fh,000h DB 000h,0c2h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mul_by_1_plus_i_mod_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2466,8 +2490,9 @@ $L$SEH_info_sgn0_pty_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sgn0_pty_mod_384_body:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2483,8 +2508,9 @@ $L$SEH_info_sgn0_pty_mod_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sgn0_pty_mod_384x_body:: DB 1,0,9,0 DB 000h,034h,001h,000h @@ -2492,7 +2518,8 @@ DB 000h,054h,002h,000h DB 000h,074h,004h,000h DB 000h,064h,005h,000h DB 000h,022h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sgn0_pty_mod_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h diff --git a/crypto/blst_src/build/win64/add_mod_384x384-x86_64.asm b/crypto/blst_src/build/win64/add_mod_384x384-x86_64.asm index 57d1752fd3c..59b51a910ce 100644 --- a/crypto/blst_src/build/win64/add_mod_384x384-x86_64.asm +++ b/crypto/blst_src/build/win64/add_mod_384x384-x86_64.asm @@ -5,6 +5,7 @@ OPTION DOTNAME ALIGN 32 __add_mod_384x384 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -72,6 +73,7 @@ __add_mod_384x384 ENDP ALIGN 32 __sub_mod_384x384 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov 
r10,QWORD PTR[16+rsi] @@ -144,15 +146,14 @@ add_mod_384x384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_add_mod_384x384:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx push r12 @@ -203,15 +204,14 @@ sub_mod_384x384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sub_mod_384x384:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx push r12 @@ -285,8 +285,9 @@ $L$SEH_info_add_mod_384x384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_add_mod_384x384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -298,7 +299,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_add_mod_384x384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -309,8 +311,9 @@ $L$SEH_info_sub_mod_384x384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sub_mod_384x384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -322,7 +325,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sub_mod_384x384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h diff --git a/crypto/blst_src/build/win64/blst.def b/crypto/blst_src/build/win64/blst.def index 3fbb6b3a97d..dda95336a93 100644 --- a/crypto/blst_src/build/win64/blst.def +++ b/crypto/blst_src/build/win64/blst.def @@ -152,6 +152,7 @@ EXPORTS blst_sk_to_pk_in_g2 blst_sign_pk_in_g2 blst_miller_loop + blst_miller_loop_n blst_final_exp blst_precompute_lines blst_miller_loop_lines @@ -180,6 +181,8 @@ EXPORTS BLS12_381_NEG_G1 BLS12_381_G2 BLS12_381_NEG_G2 + blst_fr_ct_bfly + blst_fr_gs_bfly blst_fr_to blst_fr_from blst_fp_to @@ -214,4 +217,5 @@ EXPORTS blst_p2_sizeof blst_p2_affine_sizeof blst_fp12_sizeof + blst_sha256 diff --git a/crypto/blst_src/build/win64/ct_inverse_mod_256-armv8.asm b/crypto/blst_src/build/win64/ct_inverse_mod_256-armv8.asm index f3c2f0d05f9..a4467904612 100644 --- a/crypto/blst_src/build/win64/ct_inverse_mod_256-armv8.asm +++ b/crypto/blst_src/build/win64/ct_inverse_mod_256-armv8.asm @@ -1,6 +1,7 @@ AREA |.text|,CODE,ALIGN=8,ARM64 + EXPORT |ct_inverse_mod_256|[FUNC] ALIGN 32 |ct_inverse_mod_256| PROC @@ -60,14 +61,14 @@ madd x4, x16, x8, xzr // |u|*|f0| madd x4, x17, x9, x4 // |v|*|g0| str x4, [x0,#8*4] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*5] stp x5, x5, [x0,#8*7] madd x4, x12, x8, xzr // |u|*|f1| madd x4, x13, x9, x4 // |v|*|g1| str x4, [x0,#8*9] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*10] stp x5, x5, [x0,#8*12] eor x1, x1, #256 // flip-flop src |a|b|u|v| diff --git a/crypto/blst_src/build/win64/ct_inverse_mod_256-x86_64.asm b/crypto/blst_src/build/win64/ct_inverse_mod_256-x86_64.asm index 65665c9f17a..5cd09a1d8f2 100644 --- a/crypto/blst_src/build/win64/ct_inverse_mod_256-x86_64.asm +++ b/crypto/blst_src/build/win64/ct_inverse_mod_256-x86_64.asm @@ -3,6 +3,7 @@ OPTION DOTNAME PUBLIC ct_inverse_mod_256 + ALIGN 32 ct_inverse_mod_256 PROC PUBLIC DB 243,15,30,250 @@ -10,15 +11,14 @@ ct_inverse_mod_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_ct_inverse_mod_256:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov 
rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx push r12 @@ -643,6 +643,7 @@ ct_inverse_mod_256 ENDP ALIGN 32 __smulq_512x63 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -791,6 +792,7 @@ __smulq_512x63 ENDP ALIGN 32 __smulq_256x63 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[((0+0))+rsi] mov r9,QWORD PTR[((0+8))+rsi] mov r10,QWORD PTR[((0+16))+rsi] @@ -898,6 +900,7 @@ __smulq_256x63 ENDP ALIGN 32 __smulq_256_n_shift_by_31 PROC PRIVATE DB 243,15,30,250 + mov QWORD PTR[rdi],rdx mov QWORD PTR[8+rdi],rcx mov rbp,rdx @@ -1026,6 +1029,7 @@ __smulq_256_n_shift_by_31 ENDP ALIGN 32 __ab_approximation_31_256 PROC PRIVATE DB 243,15,30,250 + mov r9,QWORD PTR[24+rsi] mov r11,QWORD PTR[56+rsi] mov rbx,QWORD PTR[16+rsi] @@ -1079,6 +1083,7 @@ __ab_approximation_31_256 ENDP ALIGN 32 __inner_loop_31_256 PROC PRIVATE DB 243,15,30,250 + mov rcx,07FFFFFFF80000000h mov r13,0800000007FFFFFFFh mov r15,07FFFFFFF7FFFFFFFh @@ -1127,6 +1132,7 @@ __inner_loop_31_256 ENDP ALIGN 32 __inner_loop_62_256 PROC PRIVATE DB 243,15,30,250 + mov r15d,edx mov rdx,1 xor rcx,rcx @@ -1187,8 +1193,9 @@ $L$SEH_info_ct_inverse_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_ct_inverse_mod_256_body:: DB 1,0,18,0 DB 000h,0f4h,086h,000h @@ -1200,6 +1207,8 @@ DB 000h,054h,08bh,000h DB 000h,074h,08dh,000h DB 000h,064h,08eh,000h DB 000h,001h,08ch,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_ct_inverse_mod_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h diff --git a/crypto/blst_src/build/win64/ct_inverse_mod_384-armv8.asm b/crypto/blst_src/build/win64/ct_inverse_mod_384-armv8.asm index 4ab12e052df..311ce7638ce 100644 --- a/crypto/blst_src/build/win64/ct_inverse_mod_384-armv8.asm +++ b/crypto/blst_src/build/win64/ct_inverse_mod_384-armv8.asm @@ -1,6 +1,7 @@ AREA |.text|,CODE,ALIGN=8,ARM64 + EXPORT |ct_inverse_mod_383|[FUNC] ALIGN 32 |ct_inverse_mod_383| PROC @@ -71,7 +72,7 @@ adds x3, x3, x5 adc x4, x4, x6 stp x3, x4, [x0,#8*6] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*8] stp x5, x5, [x0,#8*10] @@ -82,7 +83,7 @@ adds x3, x3, x5 adc x4, x4, x6 stp x3, x4, [x0,#8*12] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*14] stp x5, x5, [x0,#8*16] eor x1, x1, #256 // flip-flop src |a|b|u|v| diff --git a/crypto/blst_src/build/win64/ct_is_square_mod_384-armv8.asm b/crypto/blst_src/build/win64/ct_is_square_mod_384-armv8.asm index ab72328f056..e2454897b33 100644 --- a/crypto/blst_src/build/win64/ct_is_square_mod_384-armv8.asm +++ b/crypto/blst_src/build/win64/ct_is_square_mod_384-armv8.asm @@ -1,6 +1,7 @@ AREA |.text|,CODE,ALIGN=8,ARM64 + EXPORT |ct_is_square_mod_384|[FUNC] ALIGN 32 |ct_is_square_mod_384| PROC diff --git a/crypto/blst_src/build/win64/ct_is_square_mod_384-x86_64.asm b/crypto/blst_src/build/win64/ct_is_square_mod_384-x86_64.asm index 38de6fc1229..be00f479efb 100644 --- a/crypto/blst_src/build/win64/ct_is_square_mod_384-x86_64.asm +++ b/crypto/blst_src/build/win64/ct_is_square_mod_384-x86_64.asm @@ -3,6 +3,7 @@ OPTION DOTNAME PUBLIC ct_is_square_mod_384 + ALIGN 32 ct_is_square_mod_384 PROC PUBLIC DB 243,15,30,250 @@ -10,13 +11,12 @@ ct_is_square_mod_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_ct_is_square_mod_384:: - mov rdi,rcx - mov rsi,rdx - push rbp + mov rdi,rcx + mov rsi,rdx push rbx push r12 @@ -133,6 +133,7 @@ ct_is_square_mod_384 ENDP ALIGN 32 
__smulq_384_n_shift_by_30 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -305,6 +306,7 @@ __smulq_384_n_shift_by_30 ENDP ALIGN 32 __ab_approximation_30 PROC PRIVATE DB 243,15,30,250 + mov rbx,QWORD PTR[88+rsi] mov r15,QWORD PTR[80+rsi] mov r14,QWORD PTR[72+rsi] @@ -369,6 +371,7 @@ __ab_approximation_30 ENDP ALIGN 32 __inner_loop_30 PROC PRIVATE DB 243,15,30,250 + mov rbx,07FFFFFFF80000000h mov rcx,0800000007FFFFFFFh lea r15,QWORD PTR[((-1))+rbx] @@ -430,6 +433,7 @@ __inner_loop_30 ENDP ALIGN 32 __inner_loop_48 PROC PRIVATE DB 243,15,30,250 + mov edi,48 $L$oop_48:: @@ -485,8 +489,9 @@ $L$SEH_info_ct_is_square_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_ct_is_square_mod_384_body:: DB 1,0,18,0 DB 000h,0f4h,043h,000h @@ -498,6 +503,8 @@ DB 000h,054h,048h,000h DB 000h,074h,04ah,000h DB 000h,064h,04bh,000h DB 000h,001h,049h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_ct_is_square_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h diff --git a/crypto/blst_src/build/win64/ctq_inverse_mod_384-x86_64.asm b/crypto/blst_src/build/win64/ctq_inverse_mod_384-x86_64.asm index de79f8ec80e..89fbe5d0666 100644 --- a/crypto/blst_src/build/win64/ctq_inverse_mod_384-x86_64.asm +++ b/crypto/blst_src/build/win64/ctq_inverse_mod_384-x86_64.asm @@ -1,8 +1,13 @@ OPTION DOTNAME +EXTERN ct_inverse_mod_383$1:NEAR +_DATA SEGMENT +COMM __blst_platform_cap:DWORD:1 +_DATA ENDS .text$ SEGMENT ALIGN(256) 'CODE' PUBLIC ct_inverse_mod_383 + ALIGN 32 ct_inverse_mod_383 PROC PUBLIC DB 243,15,30,250 @@ -10,13 +15,16 @@ ct_inverse_mod_383 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_ct_inverse_mod_383:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz ct_inverse_mod_383$1 +endif push rbp push rbx @@ -548,6 +556,7 @@ ct_inverse_mod_383 ENDP ALIGN 32 __smulq_767x63 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -758,6 +767,7 @@ __smulq_767x63 ENDP ALIGN 32 __smulq_383x63 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -899,6 +909,7 @@ __smulq_383x63 ENDP ALIGN 32 __smulq_383_n_shift_by_62 PROC PRIVATE DB 243,15,30,250 + mov rbx,rdx mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] @@ -1075,6 +1086,7 @@ __smulq_383_n_shift_by_62 ENDP ALIGN 32 __ab_approximation_62 PROC PRIVATE DB 243,15,30,250 + mov r9,QWORD PTR[40+rsi] mov r11,QWORD PTR[88+rsi] mov rbx,QWORD PTR[32+rsi] @@ -1131,6 +1143,7 @@ ALIGN 8 DD 0 __inner_loop_62 PROC PRIVATE DB 243,15,30,250 + mov rdx,1 xor rcx,rcx xor r12,r12 @@ -1200,8 +1213,9 @@ $L$SEH_info_ct_inverse_mod_383_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_ct_inverse_mod_383_body:: DB 1,0,18,0 DB 000h,0f4h,08bh,000h @@ -1213,6 +1227,8 @@ DB 000h,054h,090h,000h DB 000h,074h,092h,000h DB 000h,064h,093h,000h DB 000h,001h,091h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_ct_inverse_mod_383_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h diff --git a/crypto/blst_src/build/win64/ctx_inverse_mod_384-x86_64.asm b/crypto/blst_src/build/win64/ctx_inverse_mod_384-x86_64.asm index df4c46a4c44..024da69a645 100644 --- a/crypto/blst_src/build/win64/ctx_inverse_mod_384-x86_64.asm +++ b/crypto/blst_src/build/win64/ctx_inverse_mod_384-x86_64.asm @@ -1,8 +1,10 @@ OPTION DOTNAME +PUBLIC 
ct_inverse_mod_383$1 .text$ SEGMENT ALIGN(256) 'CODE' PUBLIC ctx_inverse_mod_383 + ALIGN 32 ctx_inverse_mod_383 PROC PUBLIC DB 243,15,30,250 @@ -10,13 +12,13 @@ ctx_inverse_mod_383 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_ctx_inverse_mod_383:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +ct_inverse_mod_383$1:: push rbp push rbx @@ -814,7 +816,7 @@ $L$SEH_body_ctx_inverse_mod_383:: mov r10,QWORD PTR[48+rsi] - call __inner_loop_62 + call __tail_loop_53 @@ -890,6 +892,7 @@ ctx_inverse_mod_383 ENDP ALIGN 32 __smulx_767x63 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -1054,6 +1057,7 @@ __smulx_767x63 ENDP ALIGN 32 __smulx_383x63 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[((0+0))+rsi] mov r9,QWORD PTR[((0+8))+rsi] mov r10,QWORD PTR[((0+16))+rsi] @@ -1161,6 +1165,7 @@ __smulx_383x63 ENDP ALIGN 32 __smulx_383_n_shift_by_31 PROC PRIVATE DB 243,15,30,250 + mov rbx,rdx xor r14,r14 mov r8,QWORD PTR[((0+0))+rsi] @@ -1306,6 +1311,7 @@ __smulx_383_n_shift_by_31 ENDP ALIGN 32 __smulx_191_n_shift_by_31 PROC PRIVATE DB 243,15,30,250 + mov rbx,rdx mov r8,QWORD PTR[((0+0))+rsi] mov r9,QWORD PTR[((0+8))+rsi] @@ -1397,6 +1403,7 @@ __smulx_191_n_shift_by_31 ENDP ALIGN 32 __ab_approximation_31 PROC PRIVATE DB 243,15,30,250 + mov r9,QWORD PTR[40+rsi] mov r11,QWORD PTR[88+rsi] mov rbx,QWORD PTR[32+rsi] @@ -1467,6 +1474,7 @@ __ab_approximation_31 ENDP ALIGN 32 __inner_loop_31 PROC PRIVATE DB 243,15,30,250 + mov rcx,07FFFFFFF80000000h mov r13,0800000007FFFFFFFh mov r15,07FFFFFFF7FFFFFFFh @@ -1513,14 +1521,15 @@ __inner_loop_31 ENDP ALIGN 32 -__inner_loop_62 PROC PRIVATE +__tail_loop_53 PROC PRIVATE DB 243,15,30,250 + mov rdx,1 xor rcx,rcx xor r12,r12 mov r13,1 -$L$oop_62:: +$L$oop_53:: xor rax,rax test r8,1 mov rbx,r10 @@ -1547,10 +1556,10 @@ $L$oop_62:: sub rdx,rax sub rcx,rbx sub edi,1 - jnz $L$oop_62 + jnz $L$oop_53 DB 0F3h,0C3h ;repret -__inner_loop_62 ENDP +__tail_loop_53 ENDP .text$ ENDS .pdata SEGMENT READONLY ALIGN(4) ALIGN 4 @@ -1573,8 +1582,9 @@ $L$SEH_info_ctx_inverse_mod_383_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_ctx_inverse_mod_383_body:: DB 1,0,18,0 DB 000h,0f4h,08bh,000h @@ -1586,6 +1596,8 @@ DB 000h,054h,090h,000h DB 000h,074h,092h,000h DB 000h,064h,093h,000h DB 000h,001h,091h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_ctx_inverse_mod_383_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h diff --git a/crypto/blst_src/build/win64/div3w-armv8.asm b/crypto/blst_src/build/win64/div3w-armv8.asm index 7114ccf0c2e..aec90679eea 100644 --- a/crypto/blst_src/build/win64/div3w-armv8.asm +++ b/crypto/blst_src/build/win64/div3w-armv8.asm @@ -25,7 +25,7 @@ asr x3,x0,#63 // top bit -> mask add x0,x0,x0 // Q <<= 1 subs x6,x4,x1 // R - D - add x0,x0,#1 // Q + specilative bit + add x0,x0,#1 // Q + speculative bit sbcs x7,x5,x2 sbc x0,x0,xzr // subtract speculative bit diff --git a/crypto/blst_src/build/win64/div3w-x86_64.asm b/crypto/blst_src/build/win64/div3w-x86_64.asm index c35f426f3d2..805c5b1fcb0 100644 --- a/crypto/blst_src/build/win64/div3w-x86_64.asm +++ b/crypto/blst_src/build/win64/div3w-x86_64.asm @@ -9,12 +9,14 @@ div_3_limbs PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi - mov rax,rsp + mov r11,rsp $L$SEH_begin_div_3_limbs:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 - +$L$SEH_body_div_3_limbs:: mov r8,QWORD PTR[rdi] mov r9,QWORD PTR[8+rdi] @@ -47,9 +49,12 @@ $L$oop:: or rax,rcx 
+$L$SEH_epilogue_div_3_limbs:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] + DB 0F3h,0C3h ;repret + $L$SEH_end_div_3_limbs:: div_3_limbs ENDP PUBLIC quot_rem_128 @@ -60,12 +65,14 @@ quot_rem_128 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi - mov rax,rsp + mov r11,rsp $L$SEH_begin_quot_rem_128:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 - +$L$SEH_body_quot_rem_128:: mov rax,rdx mov rcx,rdx @@ -101,9 +108,12 @@ $L$SEH_begin_quot_rem_128:: mov rax,rcx +$L$SEH_epilogue_quot_rem_128:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] + DB 0F3h,0C3h ;repret + $L$SEH_end_quot_rem_128:: quot_rem_128 ENDP @@ -119,12 +129,14 @@ quot_rem_64 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi - mov rax,rsp + mov r11,rsp $L$SEH_begin_quot_rem_64:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 - +$L$SEH_body_quot_rem_64:: mov rax,rdx imul rdx,QWORD PTR[rsi] @@ -136,17 +148,110 @@ $L$SEH_begin_quot_rem_64:: mov QWORD PTR[rdi],r10 mov QWORD PTR[8+rdi],rax +$L$SEH_epilogue_quot_rem_64:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] + DB 0F3h,0C3h ;repret + $L$SEH_end_quot_rem_64:: quot_rem_64 ENDP .text$ ENDS .pdata SEGMENT READONLY ALIGN(4) ALIGN 4 + DD imagerel $L$SEH_begin_div_3_limbs + DD imagerel $L$SEH_body_div_3_limbs + DD imagerel $L$SEH_info_div_3_limbs_prologue + + DD imagerel $L$SEH_body_div_3_limbs + DD imagerel $L$SEH_epilogue_div_3_limbs + DD imagerel $L$SEH_info_div_3_limbs_body + + DD imagerel $L$SEH_epilogue_div_3_limbs + DD imagerel $L$SEH_end_div_3_limbs + DD imagerel $L$SEH_info_div_3_limbs_epilogue + + DD imagerel $L$SEH_begin_quot_rem_128 + DD imagerel $L$SEH_body_quot_rem_128 + DD imagerel $L$SEH_info_quot_rem_128_prologue + + DD imagerel $L$SEH_body_quot_rem_128 + DD imagerel $L$SEH_epilogue_quot_rem_128 + DD imagerel $L$SEH_info_quot_rem_128_body + + DD imagerel $L$SEH_epilogue_quot_rem_128 + DD imagerel $L$SEH_end_quot_rem_128 + DD imagerel $L$SEH_info_quot_rem_128_epilogue + + DD imagerel $L$SEH_begin_quot_rem_64 + DD imagerel $L$SEH_body_quot_rem_64 + DD imagerel $L$SEH_info_quot_rem_64_prologue + + DD imagerel $L$SEH_body_quot_rem_64 + DD imagerel $L$SEH_epilogue_quot_rem_64 + DD imagerel $L$SEH_info_quot_rem_64_body + + DD imagerel $L$SEH_epilogue_quot_rem_64 + DD imagerel $L$SEH_end_quot_rem_64 + DD imagerel $L$SEH_info_quot_rem_64_epilogue + .pdata ENDS .xdata SEGMENT READONLY ALIGN(8) ALIGN 8 +$L$SEH_info_div_3_limbs_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_div_3_limbs_body:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h +$L$SEH_info_div_3_limbs_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_quot_rem_128_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_quot_rem_128_body:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h +$L$SEH_info_quot_rem_128_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_quot_rem_64_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_quot_rem_64_body:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h +$L$SEH_info_quot_rem_64_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + .xdata ENDS 
END diff --git a/crypto/blst_src/build/win64/mulq_mont_256-x86_64.asm b/crypto/blst_src/build/win64/mulq_mont_256-x86_64.asm index c3bf8634617..6aedca7cdaf 100644 --- a/crypto/blst_src/build/win64/mulq_mont_256-x86_64.asm +++ b/crypto/blst_src/build/win64/mulq_mont_256-x86_64.asm @@ -1,4 +1,11 @@ OPTION DOTNAME +EXTERN mul_mont_sparse_256$1:NEAR +EXTERN sqr_mont_sparse_256$1:NEAR +EXTERN from_mont_256$1:NEAR +EXTERN redc_mont_256$1:NEAR +_DATA SEGMENT +COMM __blst_platform_cap:DWORD:1 +_DATA ENDS .text$ SEGMENT ALIGN(256) 'CODE' PUBLIC mul_mont_sparse_256 @@ -11,14 +18,17 @@ mul_mont_sparse_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_mont_sparse_256:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz mul_mont_sparse_256$1 +endif push rbp push rbx @@ -83,13 +93,16 @@ sqr_mont_sparse_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqr_mont_sparse_256:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sqr_mont_sparse_256$1 +endif push rbp push rbx @@ -148,6 +161,7 @@ sqr_mont_sparse_256 ENDP ALIGN 32 __mulq_mont_sparse_256 PROC PRIVATE DB 243,15,30,250 + mul r14 add r10,rax mov rax,r15 @@ -434,13 +448,16 @@ from_mont_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_from_mont_256:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz from_mont_256$1 +endif push rbp push rbx @@ -516,13 +533,16 @@ redc_mont_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_redc_mont_256:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz redc_mont_256$1 +endif push rbp push rbx @@ -597,6 +617,7 @@ redc_mont_256 ENDP ALIGN 32 __mulq_by_1_mont_256 PROC PRIVATE DB 243,15,30,250 + mov rax,QWORD PTR[rsi] mov r10,QWORD PTR[8+rsi] mov r11,QWORD PTR[16+rsi] @@ -787,8 +808,9 @@ $L$SEH_info_mul_mont_sparse_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mul_mont_sparse_256_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -800,7 +822,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mul_mont_sparse_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -811,8 +834,9 @@ $L$SEH_info_sqr_mont_sparse_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqr_mont_sparse_256_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -824,7 +848,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqr_mont_sparse_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -835,8 +860,9 @@ $L$SEH_info_from_mont_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_from_mont_256_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -848,7 +874,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_from_mont_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -859,8 +886,9 @@ $L$SEH_info_redc_mont_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 
-DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_redc_mont_256_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -872,7 +900,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_redc_mont_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h diff --git a/crypto/blst_src/build/win64/mulq_mont_384-x86_64.asm b/crypto/blst_src/build/win64/mulq_mont_384-x86_64.asm index 0ccb46786c3..8563815917e 100644 --- a/crypto/blst_src/build/win64/mulq_mont_384-x86_64.asm +++ b/crypto/blst_src/build/win64/mulq_mont_384-x86_64.asm @@ -1,4 +1,22 @@ OPTION DOTNAME +EXTERN mul_mont_384x$1:NEAR +EXTERN sqr_mont_384x$1:NEAR +EXTERN mul_382x$1:NEAR +EXTERN sqr_382x$1:NEAR +EXTERN mul_384$1:NEAR +EXTERN sqr_384$1:NEAR +EXTERN redc_mont_384$1:NEAR +EXTERN from_mont_384$1:NEAR +EXTERN sgn0_pty_mont_384$1:NEAR +EXTERN sgn0_pty_mont_384x$1:NEAR +EXTERN mul_mont_384$1:NEAR +EXTERN sqr_mont_384$1:NEAR +EXTERN sqr_n_mul_mont_384$1:NEAR +EXTERN sqr_n_mul_mont_383$1:NEAR +EXTERN sqr_mont_382x$1:NEAR +_DATA SEGMENT +COMM __blst_platform_cap:DWORD:1 +_DATA ENDS .text$ SEGMENT ALIGN(256) 'CODE' @@ -9,8 +27,9 @@ OPTION DOTNAME ALIGN 32 -__sub_mod_384x384 PROC PRIVATE +__subq_mod_384x384 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -71,12 +90,13 @@ __sub_mod_384x384 PROC PRIVATE mov QWORD PTR[88+rdi],rsi DB 0F3h,0C3h ;repret -__sub_mod_384x384 ENDP +__subq_mod_384x384 ENDP ALIGN 32 -__add_mod_384 PROC PRIVATE +__addq_mod_384 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -120,12 +140,13 @@ __add_mod_384 PROC PRIVATE mov QWORD PTR[40+rdi],r13 DB 0F3h,0C3h ;repret -__add_mod_384 ENDP +__addq_mod_384 ENDP ALIGN 32 -__sub_mod_384 PROC PRIVATE +__subq_mod_384 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -133,7 +154,7 @@ __sub_mod_384 PROC PRIVATE mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] -__sub_mod_384_a_is_loaded:: +__subq_mod_384_a_is_loaded:: sub r8,QWORD PTR[rdx] mov r14,QWORD PTR[rcx] sbb r9,QWORD PTR[8+rdx] @@ -169,7 +190,7 @@ __sub_mod_384_a_is_loaded:: mov QWORD PTR[40+rdi],r13 DB 0F3h,0C3h ;repret -__sub_mod_384 ENDP +__subq_mod_384 ENDP PUBLIC mul_mont_384x @@ -180,14 +201,17 @@ mul_mont_384x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_mont_384x:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz mul_mont_384x$1 +endif push rbp push rbx @@ -228,12 +252,12 @@ $L$SEH_body_mul_mont_384x:: mov rcx,QWORD PTR[8+rsp] lea rdx,QWORD PTR[((-48))+rsi] lea rdi,QWORD PTR[((40+192+48))+rsp] - call __add_mod_384 + call __addq_mod_384 mov rsi,QWORD PTR[16+rsp] lea rdx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[((-48))+rdi] - call __add_mod_384 + call __addq_mod_384 lea rbx,QWORD PTR[rdi] lea rsi,QWORD PTR[48+rdi] @@ -243,17 +267,17 @@ $L$SEH_body_mul_mont_384x:: lea rsi,QWORD PTR[rdi] lea rdx,QWORD PTR[40+rsp] mov rcx,QWORD PTR[8+rsp] - call __sub_mod_384x384 + call __subq_mod_384x384 lea rsi,QWORD PTR[rdi] lea rdx,QWORD PTR[((-96))+rdi] - call __sub_mod_384x384 + call __subq_mod_384x384 lea rsi,QWORD PTR[40+rsp] lea rdx,QWORD PTR[((40+96))+rsp] lea rdi,QWORD PTR[40+rsp] - call __sub_mod_384x384 + call __subq_mod_384x384 mov rbx,rcx @@ -262,14 +286,14 @@ $L$SEH_body_mul_mont_384x:: mov rcx,QWORD PTR[rsp] mov rdi,QWORD 
PTR[32+rsp] call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 lea rsi,QWORD PTR[((40+192))+rsp] mov rcx,QWORD PTR[rsp] lea rdi,QWORD PTR[48+rdi] call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 lea r8,QWORD PTR[328+rsp] mov r15,QWORD PTR[r8] @@ -304,13 +328,16 @@ sqr_mont_384x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqr_mont_384x:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sqr_mont_384x$1 +endif push rbp push rbx @@ -336,13 +363,13 @@ $L$SEH_body_sqr_mont_384x:: lea rdx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[32+rsp] - call __add_mod_384 + call __addq_mod_384 mov rsi,QWORD PTR[16+rsp] lea rdx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[((32+48))+rsp] - call __sub_mod_384 + call __subq_mod_384 mov rsi,QWORD PTR[16+rsp] @@ -435,13 +462,16 @@ mul_382x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_382x:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz mul_382x$1 +endif push rbp push rbx @@ -531,18 +561,18 @@ $L$SEH_body_mul_382x:: lea rdx,QWORD PTR[32+rsp] mov rcx,QWORD PTR[24+rsp] mov rdi,rsi - call __sub_mod_384x384 + call __subq_mod_384x384 lea rsi,QWORD PTR[rdi] lea rdx,QWORD PTR[((-96))+rdi] - call __sub_mod_384x384 + call __subq_mod_384x384 lea rsi,QWORD PTR[((-96))+rdi] lea rdx,QWORD PTR[32+rsp] lea rdi,QWORD PTR[((-96))+rdi] - call __sub_mod_384x384 + call __subq_mod_384x384 lea r8,QWORD PTR[136+rsp] mov r15,QWORD PTR[r8] @@ -577,12 +607,15 @@ sqr_382x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqr_382x:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sqr_382x$1 +endif push rbp push rbx @@ -633,7 +666,7 @@ $L$SEH_body_sqr_382x:: lea rdx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[48+rdi] - call __sub_mod_384_a_is_loaded + call __subq_mod_384_a_is_loaded lea rsi,QWORD PTR[rdi] @@ -716,12 +749,15 @@ mul_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz mul_384$1 +endif push rbp push rbx @@ -755,6 +791,7 @@ mul_384 ENDP ALIGN 32 __mulq_384 PROC PRIVATE DB 243,15,30,250 + mov rax,QWORD PTR[rbx] mov rbp,rax @@ -1046,11 +1083,14 @@ sqr_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqr_384:: - mov rdi,rcx - mov rsi,rdx - + mov rdi,rcx + mov rsi,rdx +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sqr_384$1 +endif push rbp push rbx @@ -1097,6 +1137,7 @@ sqr_384 ENDP ALIGN 32 __sqrq_384 PROC PRIVATE DB 243,15,30,250 + mov rax,QWORD PTR[rsi] mov r15,QWORD PTR[8+rsi] mov rcx,QWORD PTR[16+rsi] @@ -1294,13 +1335,16 @@ sqr_mont_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqr_mont_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sqr_mont_384$1 +endif push rbp push rbx @@ -1330,7 +1374,7 @@ $L$SEH_body_sqr_mont_384:: mov rbx,QWORD PTR[104+rsp] mov rdi,QWORD PTR[112+rsp] call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 lea r8,QWORD PTR[120+rsp] mov r15,QWORD PTR[120+rsp] @@ -1368,13 +1412,16 @@ redc_mont_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_redc_mont_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +ifdef 
__BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz redc_mont_384$1 +endif push rbp push rbx @@ -1394,7 +1441,7 @@ $L$SEH_body_redc_mont_384:: mov rbx,rdx call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 mov r15,QWORD PTR[8+rsp] @@ -1432,13 +1479,16 @@ from_mont_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_from_mont_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz from_mont_384$1 +endif push rbp push rbx @@ -1515,6 +1565,7 @@ from_mont_384 ENDP ALIGN 32 __mulq_by_1_mont_384 PROC PRIVATE DB 243,15,30,250 + mov rax,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -1810,8 +1861,9 @@ __mulq_by_1_mont_384 ENDP ALIGN 32 -__redc_tail_mont_384 PROC PRIVATE +__redq_tail_mont_384 PROC PRIVATE DB 243,15,30,250 + add r14,QWORD PTR[48+rsi] mov rax,r14 adc r15,QWORD PTR[56+rsi] @@ -1852,7 +1904,7 @@ __redc_tail_mont_384 PROC PRIVATE mov QWORD PTR[40+rdi],r11 DB 0F3h,0C3h ;repret -__redc_tail_mont_384 ENDP +__redq_tail_mont_384 ENDP PUBLIC sgn0_pty_mont_384 @@ -1864,12 +1916,15 @@ sgn0_pty_mont_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sgn0_pty_mont_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sgn0_pty_mont_384$1 +endif push rbp push rbx @@ -1948,12 +2003,15 @@ sgn0_pty_mont_384x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sgn0_pty_mont_384x:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sgn0_pty_mont_384x$1 +endif push rbp push rbx @@ -2081,14 +2139,17 @@ mul_mont_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_mont_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz mul_mont_384$1 +endif push rbp push rbx @@ -2143,6 +2204,7 @@ mul_mont_384 ENDP ALIGN 32 __mulq_mont_384 PROC PRIVATE DB 243,15,30,250 + mov rdi,rax mul r14 mov r8,rax @@ -2750,15 +2812,18 @@ sqr_n_mul_mont_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqr_n_mul_mont_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] mov r9,QWORD PTR[48+rsp] - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sqr_n_mul_mont_384$1 +endif push rbp push rbx @@ -2792,7 +2857,7 @@ $L$oop_sqr_384:: mov rcx,QWORD PTR[rsp] mov rbx,QWORD PTR[16+rsp] call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 movd edx,xmm1 lea rsi,QWORD PTR[rdi] @@ -2847,15 +2912,18 @@ sqr_n_mul_mont_383 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqr_n_mul_mont_383:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] mov r9,QWORD PTR[48+rsp] - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sqr_n_mul_mont_383$1 +endif push rbp push rbx @@ -2950,6 +3018,7 @@ sqr_n_mul_mont_383 ENDP ALIGN 32 __mulq_mont_383_nonred PROC PRIVATE DB 243,15,30,250 + mov rbp,rax mul r14 mov r8,rax @@ -3514,13 +3583,16 @@ sqr_mont_382x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqr_mont_382x:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sqr_mont_382x$1 +endif push rbp push rbx @@ -3882,8 +3954,9 @@ $L$SEH_info_mul_mont_384x_prologue:: DB 1,0,5,00bh DB 
0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mul_mont_384x_body:: DB 1,0,18,0 DB 000h,0f4h,029h,000h @@ -3895,6 +3968,8 @@ DB 000h,054h,02eh,000h DB 000h,074h,030h,000h DB 000h,064h,031h,000h DB 000h,001h,02fh,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mul_mont_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3905,8 +3980,9 @@ $L$SEH_info_sqr_mont_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqr_mont_384x_body:: DB 1,0,18,0 DB 000h,0f4h,011h,000h @@ -3918,6 +3994,8 @@ DB 000h,054h,016h,000h DB 000h,074h,018h,000h DB 000h,064h,019h,000h DB 000h,001h,017h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqr_mont_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3928,8 +4006,9 @@ $L$SEH_info_mul_382x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mul_382x_body:: DB 1,0,18,0 DB 000h,0f4h,011h,000h @@ -3941,6 +4020,8 @@ DB 000h,054h,016h,000h DB 000h,074h,018h,000h DB 000h,064h,019h,000h DB 000h,001h,017h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mul_382x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3951,8 +4032,9 @@ $L$SEH_info_sqr_382x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqr_382x_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -3964,7 +4046,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqr_382x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3975,8 +4058,9 @@ $L$SEH_info_mul_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mul_384_body:: DB 1,0,11,0 DB 000h,0c4h,000h,000h @@ -3996,8 +4080,9 @@ $L$SEH_info_sqr_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqr_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -4009,7 +4094,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqr_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -4020,8 +4106,9 @@ $L$SEH_info_sqr_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqr_mont_384_body:: DB 1,0,18,0 DB 000h,0f4h,00fh,000h @@ -4033,6 +4120,8 @@ DB 000h,054h,014h,000h DB 000h,074h,016h,000h DB 000h,064h,017h,000h DB 000h,001h,015h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqr_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -4043,8 +4132,9 @@ $L$SEH_info_redc_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_redc_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -4056,7 +4146,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_redc_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -4067,8 +4158,9 @@ $L$SEH_info_from_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_from_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -4080,7 +4172,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 
000h,000h,000h,000h $L$SEH_info_from_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -4091,8 +4184,9 @@ $L$SEH_info_sgn0_pty_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sgn0_pty_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -4104,7 +4198,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sgn0_pty_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -4115,8 +4210,9 @@ $L$SEH_info_sgn0_pty_mont_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sgn0_pty_mont_384x_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -4128,7 +4224,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sgn0_pty_mont_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -4139,8 +4236,9 @@ $L$SEH_info_mul_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mul_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,003h,000h @@ -4152,7 +4250,8 @@ DB 000h,054h,008h,000h DB 000h,074h,00ah,000h DB 000h,064h,00bh,000h DB 000h,082h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mul_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -4163,8 +4262,9 @@ $L$SEH_info_sqr_n_mul_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqr_n_mul_mont_384_body:: DB 1,0,18,0 DB 000h,0f4h,011h,000h @@ -4176,6 +4276,8 @@ DB 000h,054h,016h,000h DB 000h,074h,018h,000h DB 000h,064h,019h,000h DB 000h,001h,017h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqr_n_mul_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -4186,8 +4288,9 @@ $L$SEH_info_sqr_n_mul_mont_383_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqr_n_mul_mont_383_body:: DB 1,0,18,0 DB 000h,0f4h,011h,000h @@ -4199,6 +4302,8 @@ DB 000h,054h,016h,000h DB 000h,074h,018h,000h DB 000h,064h,019h,000h DB 000h,001h,017h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqr_n_mul_mont_383_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -4209,8 +4314,9 @@ $L$SEH_info_sqr_mont_382x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqr_mont_382x_body:: DB 1,0,18,0 DB 000h,0f4h,011h,000h @@ -4222,6 +4328,8 @@ DB 000h,054h,016h,000h DB 000h,074h,018h,000h DB 000h,064h,019h,000h DB 000h,001h,017h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqr_mont_382x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h diff --git a/crypto/blst_src/build/win64/mulx_mont_256-x86_64.asm b/crypto/blst_src/build/win64/mulx_mont_256-x86_64.asm index 83534c629e9..21d18a8b40b 100644 --- a/crypto/blst_src/build/win64/mulx_mont_256-x86_64.asm +++ b/crypto/blst_src/build/win64/mulx_mont_256-x86_64.asm @@ -1,4 +1,8 @@ OPTION DOTNAME +PUBLIC mul_mont_sparse_256$1 +PUBLIC sqr_mont_sparse_256$1 +PUBLIC from_mont_256$1 +PUBLIC redc_mont_256$1 .text$ SEGMENT ALIGN(256) 'CODE' PUBLIC mulx_mont_sparse_256 @@ -11,14 +15,14 @@ mulx_mont_sparse_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mulx_mont_sparse_256:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] - - - +mul_mont_sparse_256$1:: push rbp push rbx 
@@ -81,13 +85,13 @@ sqrx_mont_sparse_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqrx_mont_sparse_256:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +sqr_mont_sparse_256$1:: push rbp push rbx @@ -144,6 +148,7 @@ sqrx_mont_sparse_256 ENDP ALIGN 32 __mulx_mont_sparse_256 PROC PRIVATE DB 243,15,30,250 + mulx r12,r15,r15 mulx r13,rbp,rbp add r11,r15 @@ -346,13 +351,13 @@ fromx_mont_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_fromx_mont_256:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +from_mont_256$1:: push rbp push rbx @@ -428,13 +433,13 @@ redcx_mont_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_redcx_mont_256:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +redc_mont_256$1:: push rbp push rbx @@ -509,6 +514,7 @@ redcx_mont_256 ENDP ALIGN 32 __mulx_by_1_mont_256 PROC PRIVATE DB 243,15,30,250 + mov rax,QWORD PTR[rsi] mov r11,QWORD PTR[8+rsi] mov r12,QWORD PTR[16+rsi] @@ -699,8 +705,9 @@ $L$SEH_info_mulx_mont_sparse_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mulx_mont_sparse_256_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -712,7 +719,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mulx_mont_sparse_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -723,8 +731,9 @@ $L$SEH_info_sqrx_mont_sparse_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqrx_mont_sparse_256_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -736,7 +745,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqrx_mont_sparse_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -747,8 +757,9 @@ $L$SEH_info_fromx_mont_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_fromx_mont_256_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -760,7 +771,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_fromx_mont_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -771,8 +783,9 @@ $L$SEH_info_redcx_mont_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_redcx_mont_256_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -784,7 +797,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_redcx_mont_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h diff --git a/crypto/blst_src/build/win64/mulx_mont_384-x86_64.asm b/crypto/blst_src/build/win64/mulx_mont_384-x86_64.asm index 25bee97731b..4dc41b04098 100644 --- a/crypto/blst_src/build/win64/mulx_mont_384-x86_64.asm +++ b/crypto/blst_src/build/win64/mulx_mont_384-x86_64.asm @@ -1,4 +1,19 @@ OPTION DOTNAME +PUBLIC mul_mont_384x$1 +PUBLIC sqr_mont_384x$1 +PUBLIC mul_382x$1 +PUBLIC sqr_382x$1 +PUBLIC mul_384$1 +PUBLIC sqr_384$1 +PUBLIC redc_mont_384$1 +PUBLIC from_mont_384$1 +PUBLIC sgn0_pty_mont_384$1 +PUBLIC sgn0_pty_mont_384x$1 +PUBLIC mul_mont_384$1 +PUBLIC sqr_mont_384$1 +PUBLIC sqr_n_mul_mont_384$1 +PUBLIC sqr_n_mul_mont_383$1 +PUBLIC sqr_mont_382x$1 .text$ SEGMENT ALIGN(256) 'CODE' @@ -9,8 +24,9 @@ OPTION DOTNAME 
ALIGN 32 -__sub_mod_384x384 PROC PRIVATE +__subx_mod_384x384 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -71,12 +87,13 @@ __sub_mod_384x384 PROC PRIVATE mov QWORD PTR[88+rdi],rsi DB 0F3h,0C3h ;repret -__sub_mod_384x384 ENDP +__subx_mod_384x384 ENDP ALIGN 32 -__add_mod_384 PROC PRIVATE +__addx_mod_384 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -120,12 +137,13 @@ __add_mod_384 PROC PRIVATE mov QWORD PTR[40+rdi],r13 DB 0F3h,0C3h ;repret -__add_mod_384 ENDP +__addx_mod_384 ENDP ALIGN 32 -__sub_mod_384 PROC PRIVATE +__subx_mod_384 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -133,7 +151,7 @@ __sub_mod_384 PROC PRIVATE mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] -__sub_mod_384_a_is_loaded:: +__subx_mod_384_a_is_loaded:: sub r8,QWORD PTR[rdx] mov r14,QWORD PTR[rcx] sbb r9,QWORD PTR[8+rdx] @@ -169,7 +187,7 @@ __sub_mod_384_a_is_loaded:: mov QWORD PTR[40+rdi],r13 DB 0F3h,0C3h ;repret -__sub_mod_384 ENDP +__subx_mod_384 ENDP PUBLIC mulx_mont_384x @@ -180,14 +198,14 @@ mulx_mont_384x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mulx_mont_384x:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] - - - +mul_mont_384x$1:: push rbp push rbx @@ -229,12 +247,12 @@ $L$SEH_body_mulx_mont_384x:: lea rsi,QWORD PTR[rbx] lea rdx,QWORD PTR[((-48))+rbx] lea rdi,QWORD PTR[((40+192+48))+rsp] - call __add_mod_384 + call __addx_mod_384 mov rsi,QWORD PTR[24+rsp] lea rdx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[((-48))+rdi] - call __add_mod_384 + call __addx_mod_384 lea rbx,QWORD PTR[rdi] lea rsi,QWORD PTR[48+rdi] @@ -244,17 +262,17 @@ $L$SEH_body_mulx_mont_384x:: lea rsi,QWORD PTR[rdi] lea rdx,QWORD PTR[40+rsp] mov rcx,QWORD PTR[8+rsp] - call __sub_mod_384x384 + call __subx_mod_384x384 lea rsi,QWORD PTR[rdi] lea rdx,QWORD PTR[((-96))+rdi] - call __sub_mod_384x384 + call __subx_mod_384x384 lea rsi,QWORD PTR[40+rsp] lea rdx,QWORD PTR[((40+96))+rsp] lea rdi,QWORD PTR[40+rsp] - call __sub_mod_384x384 + call __subx_mod_384x384 lea rbx,QWORD PTR[rcx] @@ -263,14 +281,14 @@ $L$SEH_body_mulx_mont_384x:: mov rcx,QWORD PTR[rsp] mov rdi,QWORD PTR[32+rsp] call __mulx_by_1_mont_384 - call __redc_tail_mont_384 + call __redx_tail_mont_384 lea rsi,QWORD PTR[((40+192))+rsp] mov rcx,QWORD PTR[rsp] lea rdi,QWORD PTR[48+rdi] call __mulx_by_1_mont_384 - call __redc_tail_mont_384 + call __redx_tail_mont_384 lea r8,QWORD PTR[328+rsp] mov r15,QWORD PTR[r8] @@ -305,13 +323,13 @@ sqrx_mont_384x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqrx_mont_384x:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +sqr_mont_384x$1:: push rbp push rbx @@ -338,13 +356,13 @@ $L$SEH_body_sqrx_mont_384x:: lea rdx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[32+rsp] - call __add_mod_384 + call __addx_mod_384 mov rsi,QWORD PTR[24+rsp] lea rdx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[((32+48))+rsp] - call __sub_mod_384 + call __subx_mod_384 mov rsi,QWORD PTR[24+rsp] @@ -447,13 +465,13 @@ mulx_382x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mulx_382x:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +mul_382x$1:: push rbp push rbx @@ -543,18 +561,18 @@ $L$SEH_body_mulx_382x:: lea rdx,QWORD PTR[32+rsp] mov rcx,QWORD PTR[24+rsp] mov rdi,rsi - call __sub_mod_384x384 + call __subx_mod_384x384 lea rsi,QWORD PTR[rdi] lea rdx,QWORD PTR[((-96))+rdi] - call __sub_mod_384x384 + call __subx_mod_384x384 lea 
rsi,QWORD PTR[((-96))+rdi] lea rdx,QWORD PTR[32+rsp] lea rdi,QWORD PTR[((-96))+rdi] - call __sub_mod_384x384 + call __subx_mod_384x384 lea r8,QWORD PTR[136+rsp] mov r15,QWORD PTR[r8] @@ -589,12 +607,12 @@ sqrx_382x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqrx_382x:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 - - - +sqr_382x$1:: push rbp push rbx @@ -645,7 +663,7 @@ $L$SEH_body_sqrx_382x:: lea rdx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[48+rdi] - call __sub_mod_384_a_is_loaded + call __subx_mod_384_a_is_loaded lea rsi,QWORD PTR[rdi] @@ -728,12 +746,12 @@ mulx_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mulx_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 - - - +mul_384$1:: push rbp push rbx @@ -779,6 +797,7 @@ mulx_384 ENDP ALIGN 32 __mulx_384 PROC PRIVATE DB 243,15,30,250 + mov rdx,QWORD PTR[rbx] mov r14,QWORD PTR[rsi] mov r15,QWORD PTR[8+rsi] @@ -957,11 +976,11 @@ sqrx_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqrx_384:: - mov rdi,rcx - mov rsi,rdx - + mov rdi,rcx + mov rsi,rdx +sqr_384$1:: push rbp push rbx @@ -1007,6 +1026,7 @@ sqrx_384 ENDP ALIGN 32 __sqrx_384 PROC PRIVATE DB 243,15,30,250 + mov rdx,QWORD PTR[rsi] mov r14,QWORD PTR[8+rsi] mov r15,QWORD PTR[16+rsi] @@ -1153,13 +1173,13 @@ redcx_mont_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_redcx_mont_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +redc_mont_384$1:: push rbp push rbx @@ -1179,7 +1199,7 @@ $L$SEH_body_redcx_mont_384:: mov rbx,rdx call __mulx_by_1_mont_384 - call __redc_tail_mont_384 + call __redx_tail_mont_384 mov r15,QWORD PTR[8+rsp] @@ -1217,13 +1237,13 @@ fromx_mont_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_fromx_mont_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +from_mont_384$1:: push rbp push rbx @@ -1300,6 +1320,7 @@ fromx_mont_384 ENDP ALIGN 32 __mulx_by_1_mont_384 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov rdx,rcx mov r9,QWORD PTR[8+rsi] @@ -1486,8 +1507,9 @@ __mulx_by_1_mont_384 ENDP ALIGN 32 -__redc_tail_mont_384 PROC PRIVATE +__redx_tail_mont_384 PROC PRIVATE DB 243,15,30,250 + add r14,QWORD PTR[48+rsi] mov rax,r14 adc r15,QWORD PTR[56+rsi] @@ -1528,7 +1550,7 @@ __redc_tail_mont_384 PROC PRIVATE mov QWORD PTR[40+rdi],r11 DB 0F3h,0C3h ;repret -__redc_tail_mont_384 ENDP +__redx_tail_mont_384 ENDP PUBLIC sgn0x_pty_mont_384 @@ -1540,12 +1562,12 @@ sgn0x_pty_mont_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sgn0x_pty_mont_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 - - - +sgn0_pty_mont_384$1:: push rbp push rbx @@ -1624,12 +1646,12 @@ sgn0x_pty_mont_384x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sgn0x_pty_mont_384x:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 - - - +sgn0_pty_mont_384x$1:: push rbp push rbx @@ -1757,14 +1779,14 @@ mulx_mont_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mulx_mont_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] - - - +mul_mont_384$1:: push rbp push rbx @@ -1825,6 +1847,7 @@ ALIGN 32 __mulx_mont_384 PROC PRIVATE DB 243,15,30,250 + mulx r10,r14,r15 mulx r11,r15,rax add r9,r14 @@ -2230,13 +2253,13 @@ sqrx_mont_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqrx_mont_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +sqr_mont_384$1:: push rbp push rbx @@ -2304,15 +2327,15 @@ sqrx_n_mul_mont_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqrx_n_mul_mont_384:: + + mov rdi,rcx mov rsi,rdx mov 
rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] mov r9,QWORD PTR[48+rsp] - - - +sqr_n_mul_mont_384$1:: push rbp push rbx @@ -2398,15 +2421,15 @@ sqrx_n_mul_mont_383 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqrx_n_mul_mont_383:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] mov r9,QWORD PTR[48+rsp] - - - +sqr_n_mul_mont_383$1:: push rbp push rbx @@ -2485,6 +2508,7 @@ ALIGN 32 __mulx_mont_383_nonred PROC PRIVATE DB 243,15,30,250 + mulx r10,r14,r15 mulx r11,r15,rax add r9,r14 @@ -2851,13 +2875,13 @@ sqrx_mont_382x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqrx_mont_382x:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +sqr_mont_382x$1:: push rbp push rbx @@ -3229,8 +3253,9 @@ $L$SEH_info_mulx_mont_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mulx_mont_384x_body:: DB 1,0,18,0 DB 000h,0f4h,029h,000h @@ -3242,6 +3267,8 @@ DB 000h,054h,02eh,000h DB 000h,074h,030h,000h DB 000h,064h,031h,000h DB 000h,001h,02fh,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mulx_mont_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3252,8 +3279,9 @@ $L$SEH_info_sqrx_mont_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqrx_mont_384x_body:: DB 1,0,18,0 DB 000h,0f4h,011h,000h @@ -3265,6 +3293,8 @@ DB 000h,054h,016h,000h DB 000h,074h,018h,000h DB 000h,064h,019h,000h DB 000h,001h,017h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqrx_mont_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3275,8 +3305,9 @@ $L$SEH_info_mulx_382x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mulx_382x_body:: DB 1,0,18,0 DB 000h,0f4h,011h,000h @@ -3288,6 +3319,8 @@ DB 000h,054h,016h,000h DB 000h,074h,018h,000h DB 000h,064h,019h,000h DB 000h,001h,017h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mulx_382x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3298,8 +3331,9 @@ $L$SEH_info_sqrx_382x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqrx_382x_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -3311,7 +3345,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqrx_382x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3322,8 +3357,9 @@ $L$SEH_info_mulx_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mulx_384_body:: DB 1,0,17,0 DB 000h,0f4h,000h,000h @@ -3335,7 +3371,8 @@ DB 000h,054h,005h,000h DB 000h,074h,007h,000h DB 000h,064h,008h,000h DB 000h,052h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mulx_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3346,8 +3383,9 @@ $L$SEH_info_sqrx_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqrx_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -3359,7 +3397,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqrx_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3370,8 +3409,9 @@ $L$SEH_info_redcx_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_redcx_mont_384_body:: DB 
1,0,17,0 DB 000h,0f4h,001h,000h @@ -3383,7 +3423,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_redcx_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3394,8 +3435,9 @@ $L$SEH_info_fromx_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_fromx_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -3407,7 +3449,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_fromx_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3418,8 +3461,9 @@ $L$SEH_info_sgn0x_pty_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sgn0x_pty_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -3431,7 +3475,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sgn0x_pty_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3442,8 +3487,9 @@ $L$SEH_info_sgn0x_pty_mont_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sgn0x_pty_mont_384x_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -3455,7 +3501,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sgn0x_pty_mont_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3466,8 +3513,9 @@ $L$SEH_info_mulx_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mulx_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,003h,000h @@ -3479,7 +3527,8 @@ DB 000h,054h,008h,000h DB 000h,074h,00ah,000h DB 000h,064h,00bh,000h DB 000h,082h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mulx_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3490,8 +3539,9 @@ $L$SEH_info_sqrx_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqrx_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,003h,000h @@ -3503,7 +3553,8 @@ DB 000h,054h,008h,000h DB 000h,074h,00ah,000h DB 000h,064h,00bh,000h DB 000h,082h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqrx_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3514,8 +3565,9 @@ $L$SEH_info_sqrx_n_mul_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqrx_n_mul_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,005h,000h @@ -3527,7 +3579,8 @@ DB 000h,054h,00ah,000h DB 000h,074h,00ch,000h DB 000h,064h,00dh,000h DB 000h,0a2h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqrx_n_mul_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3538,8 +3591,9 @@ $L$SEH_info_sqrx_n_mul_mont_383_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqrx_n_mul_mont_383_body:: DB 1,0,17,0 DB 000h,0f4h,005h,000h @@ -3551,7 +3605,8 @@ DB 000h,054h,00ah,000h DB 000h,074h,00ch,000h DB 000h,064h,00dh,000h DB 000h,0a2h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqrx_n_mul_mont_383_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3562,8 +3617,9 @@ 
$L$SEH_info_sqrx_mont_382x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqrx_mont_382x_body:: DB 1,0,18,0 DB 000h,0f4h,011h,000h @@ -3575,6 +3631,8 @@ DB 000h,054h,016h,000h DB 000h,074h,018h,000h DB 000h,064h,019h,000h DB 000h,001h,017h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqrx_mont_382x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h diff --git a/crypto/blst_src/build/win64/sha256-armv8.asm b/crypto/blst_src/build/win64/sha256-armv8.asm index 0e0c54cb65b..31e74219c19 100644 --- a/crypto/blst_src/build/win64/sha256-armv8.asm +++ b/crypto/blst_src/build/win64/sha256-armv8.asm @@ -10,11 +10,12 @@ // // sha256_block procedure for ARMv8. // -// This module is stripped of scalar code paths, with raionale that all +// This module is stripped of scalar code paths, with rationale that all // known processors are NEON-capable. // // See original module at CRYPTOGAMS for further details. + COMMON |__blst_platform_cap|,4 AREA |.text|,CODE,ALIGN=8,ARM64 ALIGN 64 @@ -184,6 +185,11 @@ EXPORT |blst_sha256_block_data_order|[FUNC] ALIGN 16 |blst_sha256_block_data_order| PROC + adrp x16,__blst_platform_cap + ldr w16,[x16,__blst_platform_cap] + tst w16,#1 + bne |$Lv8_entry| + stp x29, x30, [sp, #-16]! mov x29, sp sub sp,sp,#16*4 diff --git a/crypto/blst_src/build/win64/sha256-x86_64.asm b/crypto/blst_src/build/win64/sha256-x86_64.asm index d3b409235e7..a502a75ecaf 100644 --- a/crypto/blst_src/build/win64/sha256-x86_64.asm +++ b/crypto/blst_src/build/win64/sha256-x86_64.asm @@ -1,4 +1,7 @@ OPTION DOTNAME +_DATA SEGMENT +COMM __blst_platform_cap:DWORD:1 +_DATA ENDS .text$ SEGMENT ALIGN(256) 'CODE' ALIGN 64 @@ -38,23 +41,23 @@ blst_sha256_block_data_order_shaext PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_blst_sha256_block_data_order_shaext:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - - - sub rsp,058h - movaps XMMWORD PTR[(-88)+r11],xmm6 - movaps XMMWORD PTR[(-72)+r11],xmm7 + push rbp - movaps XMMWORD PTR[(-56)+r11],xmm8 + mov rbp,rsp - movaps XMMWORD PTR[(-40)+r11],xmm9 + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 +$L$blst_sha256_block_data_order$2:: + sub rsp,050h - movaps XMMWORD PTR[(-24)+r11],xmm10 + movaps XMMWORD PTR[(-80)+rbp],xmm6 + movaps XMMWORD PTR[(-64)+rbp],xmm7 + movaps XMMWORD PTR[(-48)+rbp],xmm8 + movaps XMMWORD PTR[(-32)+rbp],xmm9 + movaps XMMWORD PTR[(-16)+rbp],xmm10 $L$SEH_body_blst_sha256_block_data_order_shaext:: @@ -259,16 +262,18 @@ DB 102,15,58,15,215,8 movdqu XMMWORD PTR[rdi],xmm1 movdqu XMMWORD PTR[16+rdi],xmm2 - movaps xmm6,XMMWORD PTR[((-88))+r11] - movaps xmm7,XMMWORD PTR[((-72))+r11] - movaps xmm8,XMMWORD PTR[((-56))+r11] - movaps xmm9,XMMWORD PTR[((-40))+r11] - movaps xmm10,XMMWORD PTR[((-24))+r11] - mov rsp,r11 + movaps xmm6,XMMWORD PTR[((-80))+rbp] + movaps xmm7,XMMWORD PTR[((-64))+rbp] + movaps xmm8,XMMWORD PTR[((-48))+rbp] + movaps xmm9,XMMWORD PTR[((-32))+rbp] + movaps xmm10,XMMWORD PTR[((-16))+rbp] + mov rsp,rbp + + pop rbp $L$SEH_epilogue_blst_sha256_block_data_order_shaext:: - mov rdi,QWORD PTR[8+r11] ;WIN64 epilogue - mov rsi,QWORD PTR[16+r11] + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] DB 0F3h,0C3h ;repret @@ -284,14 +289,17 @@ blst_sha256_block_data_order PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_blst_sha256_block_data_order:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - push rbp + mov rbp,rsp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + test DWORD PTR[__blst_platform_cap],2 + jnz $L$blst_sha256_block_data_order$2 push rbx 
push r12 @@ -303,21 +311,16 @@ $L$SEH_begin_blst_sha256_block_data_order:: push r15 shl rdx,4 - sub rsp,104 + sub rsp,88 lea rdx,QWORD PTR[rdx*4+rsi] - mov QWORD PTR[rsp],rdi + mov QWORD PTR[((-64))+rbp],rdi - mov QWORD PTR[16+rsp],rdx - movaps XMMWORD PTR[32+rsp],xmm6 - - movaps XMMWORD PTR[48+rsp],xmm7 - - movaps XMMWORD PTR[64+rsp],xmm8 - - movaps XMMWORD PTR[80+rsp],xmm9 - - mov rbp,rsp + mov QWORD PTR[((-48))+rbp],rdx + movaps XMMWORD PTR[(-128)+rbp],xmm6 + movaps XMMWORD PTR[(-112)+rbp],xmm7 + movaps XMMWORD PTR[(-96)+rbp],xmm8 + movaps XMMWORD PTR[(-80)+rbp],xmm9 $L$SEH_body_blst_sha256_block_data_order:: @@ -338,7 +341,7 @@ $L$SEH_body_blst_sha256_block_data_order:: ALIGN 16 $L$loop_ssse3:: movdqa xmm7,XMMWORD PTR[((K256+256))] - mov QWORD PTR[8+rbp],rsi + mov QWORD PTR[((-56))+rbp],rsi movdqu xmm0,XMMWORD PTR[rsi] movdqu xmm1,XMMWORD PTR[16+rsi] movdqu xmm2,XMMWORD PTR[32+rsi] @@ -1363,9 +1366,9 @@ DB 102,15,58,15,249,4 add eax,r15d mov r13d,r8d add r14d,eax - mov rdi,QWORD PTR[rbp] + mov rdi,QWORD PTR[((-64))+rbp] mov eax,r14d - mov rsi,QWORD PTR[8+rbp] + mov rsi,QWORD PTR[((-56))+rbp] add eax,DWORD PTR[rdi] add ebx,DWORD PTR[4+rdi] @@ -1377,7 +1380,7 @@ DB 102,15,58,15,249,4 add r11d,DWORD PTR[28+rdi] lea rsi,QWORD PTR[64+rsi] - cmp rsi,QWORD PTR[16+rbp] + cmp rsi,QWORD PTR[((-48))+rbp] mov DWORD PTR[rdi],eax mov DWORD PTR[4+rdi],ebx @@ -1390,33 +1393,27 @@ DB 102,15,58,15,249,4 jb $L$loop_ssse3 xorps xmm0,xmm0 - lea r11,QWORD PTR[((104+48))+rbp] - movaps XMMWORD PTR[rsp],xmm0 movaps XMMWORD PTR[16+rsp],xmm0 movaps XMMWORD PTR[32+rsp],xmm0 movaps XMMWORD PTR[48+rsp],xmm0 - movaps xmm6,XMMWORD PTR[32+rbp] - movaps xmm7,XMMWORD PTR[48+rbp] - movaps xmm8,XMMWORD PTR[64+rbp] - movaps xmm9,XMMWORD PTR[80+rbp] - mov r15,QWORD PTR[104+rbp] - - mov r14,QWORD PTR[((-40))+r11] - - mov r13,QWORD PTR[((-32))+r11] - - mov r12,QWORD PTR[((-24))+r11] - - mov rbx,QWORD PTR[((-16))+r11] - - mov rbp,QWORD PTR[((-8))+r11] + movaps xmm6,XMMWORD PTR[((-128))+rbp] + movaps xmm7,XMMWORD PTR[((-112))+rbp] + movaps xmm8,XMMWORD PTR[((-96))+rbp] + movaps xmm9,XMMWORD PTR[((-80))+rbp] + mov r15,QWORD PTR[((-40))+rbp] + mov r14,QWORD PTR[((-32))+rbp] + mov r13,QWORD PTR[((-24))+rbp] + mov r12,QWORD PTR[((-16))+rbp] + mov rbx,QWORD PTR[((-8))+rbp] + mov rsp,rbp + + pop rbp $L$SEH_epilogue_blst_sha256_block_data_order:: - mov rdi,QWORD PTR[8+r11] ;WIN64 epilogue - mov rsi,QWORD PTR[16+r11] + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] - lea rsp,QWORD PTR[r11] DB 0F3h,0C3h ;repret $L$SEH_end_blst_sha256_block_data_order:: @@ -1427,6 +1424,7 @@ PUBLIC blst_sha256_emit ALIGN 16 blst_sha256_emit PROC PUBLIC DB 243,15,30,250 + mov r8,QWORD PTR[rdx] mov r9,QWORD PTR[8+rdx] mov r10,QWORD PTR[16+rdx] @@ -1456,6 +1454,7 @@ PUBLIC blst_sha256_bcopy ALIGN 16 blst_sha256_bcopy PROC PUBLIC DB 243,15,30,250 + sub rcx,rdx $L$oop_bcopy:: movzx eax,BYTE PTR[rdx] @@ -1472,6 +1471,7 @@ PUBLIC blst_sha256_hcopy ALIGN 16 blst_sha256_hcopy PROC PUBLIC DB 243,15,30,250 + mov r8,QWORD PTR[rdx] mov r9,QWORD PTR[8+rdx] mov r10,QWORD PTR[16+rdx] @@ -1513,13 +1513,14 @@ ALIGN 4 .xdata SEGMENT READONLY ALIGN(8) ALIGN 8 $L$SEH_info_blst_sha256_block_data_order_shaext_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,003h -DB 0,0 +DB 1,4,6,005h +DB 4,074h,2,0 +DB 4,064h,3,0 +DB 4,053h +DB 1,050h + DD 0,0 $L$SEH_info_blst_sha256_block_data_order_shaext_body:: -DB 1,0,15,0 +DB 1,0,17,85 DB 000h,068h,000h,000h DB 000h,078h,001h,000h DB 000h,088h,002h,000h @@ -1527,43 +1528,47 @@ DB 000h,098h,003h,000h DB 
000h,0a8h,004h,000h DB 000h,074h,00ch,000h DB 000h,064h,00dh,000h -DB 000h,0a2h +DB 000h,053h +DB 000h,092h +DB 000h,050h DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_blst_sha256_block_data_order_shaext_epilogue:: -DB 1,0,5,11 +DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h -DB 000h,003h -DB 000h,000h +DB 000h,000h,000h,000h $L$SEH_info_blst_sha256_block_data_order_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,003h -DB 0,0 +DB 1,4,6,005h +DB 4,074h,2,0 +DB 4,064h,3,0 +DB 4,053h +DB 1,050h + DD 0,0 $L$SEH_info_blst_sha256_block_data_order_body:: -DB 1,0,26,5 -DB 000h,068h,002h,000h -DB 000h,078h,003h,000h -DB 000h,088h,004h,000h -DB 000h,098h,005h,000h -DB 000h,0f4h,00dh,000h -DB 000h,0e4h,00eh,000h -DB 000h,0d4h,00fh,000h -DB 000h,0c4h,010h,000h -DB 000h,034h,011h,000h -DB 000h,074h,014h,000h -DB 000h,064h,015h,000h -DB 000h,003h -DB 000h,001h,012h,000h +DB 1,0,25,133 +DB 000h,068h,000h,000h +DB 000h,078h,001h,000h +DB 000h,088h,002h,000h +DB 000h,098h,003h,000h +DB 000h,0f4h,00bh,000h +DB 000h,0e4h,00ch,000h +DB 000h,0d4h,00dh,000h +DB 000h,0c4h,00eh,000h +DB 000h,034h,00fh,000h +DB 000h,074h,012h,000h +DB 000h,064h,013h,000h +DB 000h,053h +DB 000h,0f2h DB 000h,050h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_blst_sha256_block_data_order_epilogue:: -DB 1,0,5,11 +DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h -DB 000h,003h -DB 000h,000h +DB 000h,000h,000h,000h .xdata ENDS diff --git a/crypto/blst_src/bulk_addition.c b/crypto/blst_src/bulk_addition.c index 81afc530665..4d36f405b64 100644 --- a/crypto/blst_src/bulk_addition.c +++ b/crypto/blst_src/bulk_addition.c @@ -145,8 +145,7 @@ static void ptype##s_accumulate(ptype *sum, ptype points[], size_t n) \ void prefix##s_add(ptype *sum, const ptype##_affine *const points[], \ size_t npoints) \ { \ - /* Performance with 288K scratch is within 1-2-3% from optimal */ \ - const size_t stride = sizeof(ptype)==sizeof(POINTonE1) ? 2048 : 1024; \ + const size_t stride = SCRATCH_LIMIT / sizeof(ptype); \ ptype *scratch = alloca((npoints > stride ? stride : npoints) * \ sizeof(ptype)); \ const ptype##_affine *point = NULL; \ @@ -163,6 +162,15 @@ void prefix##s_add(ptype *sum, const ptype##_affine *const points[], \ } \ } +#ifndef SCRATCH_LIMIT +# ifdef __wasm__ +# define SCRATCH_LIMIT (45 * 1024) +# else + /* Performance with 144K scratch is within 1-2-3% from optimal */ +# define SCRATCH_LIMIT (144 * 1024) +# endif +#endif + ADDITION_BTREE(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p2) ADDITION_BTREE(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2) diff --git a/crypto/blst_src/bytes.h b/crypto/blst_src/bytes.h index af910ba8145..d81ffba5d46 100644 --- a/crypto/blst_src/bytes.h +++ b/crypto/blst_src/bytes.h @@ -26,7 +26,7 @@ static inline void limbs_from_be_bytes(limb_t *restrict ret, * 'if (n % sizeof(limb_t) == 0)' is omitted because it's cheaper * to perform redundant stores than to pay penalty for * mispredicted branch. Besides, some compilers unroll the - * loop and remove redundant stores to 'restict'-ed storage... + * loop and remove redundant stores to 'restrict'-ed storage... */ ret[n / sizeof(limb_t)] = limb; } @@ -55,7 +55,7 @@ static inline void limbs_from_le_bytes(limb_t *restrict ret, * 'if (n % sizeof(limb_t) == 0)' is omitted because it's cheaper * to perform redundant stores than to pay penalty for * mispredicted branch. Besides, some compilers unroll the - * loop and remove redundant stores to 'restict'-ed storage... 
+ * loop and remove redundant stores to 'restrict'-ed storage... */ ret[n / sizeof(limb_t)] = limb; } diff --git a/crypto/blst_src/client_min_pk.c b/crypto/blst_src/client_min_pk.c new file mode 100644 index 00000000000..0fcf563f502 --- /dev/null +++ b/crypto/blst_src/client_min_pk.c @@ -0,0 +1,17 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "keygen.c" +#include "e2.c" +#include "hash_to_field.c" +#include "map_to_g2.c" +#include "e1.c" +#include "exp.c" +#include "sqrt.c" +#include "recip.c" +#include "consts.c" +#include "vect.c" +#include "exports.c" diff --git a/crypto/blst_src/client_min_sig.c b/crypto/blst_src/client_min_sig.c new file mode 100644 index 00000000000..8e4663daede --- /dev/null +++ b/crypto/blst_src/client_min_sig.c @@ -0,0 +1,17 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "keygen.c" +#include "e1.c" +#include "hash_to_field.c" +#include "map_to_g1.c" +#include "e2.c" +#include "exp.c" +#include "sqrt.c" +#include "recip.c" +#include "consts.c" +#include "vect.c" +#include "exports.c" diff --git a/crypto/blst_src/cpuid.c b/crypto/blst_src/cpuid.c new file mode 100644 index 00000000000..43b9229d341 --- /dev/null +++ b/crypto/blst_src/cpuid.c @@ -0,0 +1,85 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#if (defined(__GNUC__) || defined(__clang__) || defined(__SUNPRO_C)) && !defined(_WIN32) +__attribute__((visibility("hidden"))) +#endif +int __blst_platform_cap = 0; + +#if defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) + +# if defined(__GNUC__) || defined(__clang__) || defined(__SUNPRO_C) +static void __cpuidex(int info[4], int func, int sub) +{ + int eax, ebx, ecx, edx; + + __asm__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) + : "a"(func), "c"(sub)); + + info[0] = eax; + info[1] = ebx; + info[2] = ecx; + info[3] = edx; +} +# else +# include +# endif + +# if defined(__GNUC__) || defined(__clang__) +__attribute__((constructor)) +# endif +static int __blst_cpuid(void) +{ + int info[4], cap = 0; + + __cpuidex(info, 0, 0); + if (info[0] > 6) { + __cpuidex(info, 7, 0); + cap |= (info[1]>>19) & 1; /* ADX */ + cap |= (info[1]>>28) & 2; /* SHA */ + } + + __blst_platform_cap = cap; + + return 0; +} + +# if defined(_MSC_VER) && !defined(__clang__) +# pragma section(".CRT$XCU",read) +__declspec(allocate(".CRT$XCU")) static int (*p)(void) = __blst_cpuid; +# elif defined(__SUNPRO_C) +# pragma init(__blst_cpuid) +# endif + +#elif defined(__aarch64__) || defined(__aarch64) + +# if defined(__linux__) && (defined(__GNUC__) || defined(__clang__)) +extern unsigned long getauxval(unsigned long type) __attribute__ ((weak)); + +__attribute__((constructor)) +static int __blst_cpuid(void) +{ + int cap = 0; + + if (getauxval) { + unsigned long hwcap_ce = getauxval(16); + cap = (hwcap_ce>>6) & 1; /* SHA256 */ + } + + __blst_platform_cap = cap; + + return 0; +} +# elif defined(__APPLE__) && (defined(__GNUC__) || defined(__clang__)) +__attribute__((constructor)) +static int __blst_cpuid() +{ + __blst_platform_cap = 1; /* SHA256 */ + return 0; +} +# endif + +#endif diff --git a/crypto/blst_src/e1.c b/crypto/blst_src/e1.c index 91c4cdbf39c..f8a7be7bc14 100644 --- a/crypto/blst_src/e1.c +++ b/crypto/blst_src/e1.c 
@@ -155,7 +155,7 @@ void blst_p1_affine_serialize(unsigned char out[96], { if (vec_is_zero(in->X, 2*sizeof(in->X))) { bytes_zero(out, 96); - out[0] = 0x40; /* infinitiy bit */ + out[0] = 0x40; /* infinity bit */ } else { (void)POINTonE1_affine_Serialize_BE(out, in); } @@ -178,7 +178,7 @@ static void POINTonE1_Serialize(unsigned char out[96], const POINTonE1 *in) { if (vec_is_zero(in->Z, sizeof(in->Z))) { bytes_zero(out, 96); - out[0] = 0x40; /* infinitiy bit */ + out[0] = 0x40; /* infinity bit */ } else { (void)POINTonE1_Serialize_BE(out, in); } @@ -202,7 +202,7 @@ void blst_p1_affine_compress(unsigned char out[48], const POINTonE1_affine *in) { if (vec_is_zero(in->X, 2*sizeof(in->X))) { bytes_zero(out, 48); - out[0] = 0xc0; /* compressed and infinitiy bits */ + out[0] = 0xc0; /* compressed and infinity bits */ } else { limb_t sign = POINTonE1_affine_Compress_BE(out, in); out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); @@ -226,7 +226,7 @@ void blst_p1_compress(unsigned char out[48], const POINTonE1 *in) { if (vec_is_zero(in->Z, sizeof(in->Z))) { bytes_zero(out, 48); - out[0] = 0xc0; /* compressed and infinitiy bits */ + out[0] = 0xc0; /* compressed and infinity bits */ } else { limb_t sign = POINTonE1_Compress_BE(out, in); out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); diff --git a/crypto/blst_src/e2.c b/crypto/blst_src/e2.c index 822ac23c694..77f8064bce2 100644 --- a/crypto/blst_src/e2.c +++ b/crypto/blst_src/e2.c @@ -196,7 +196,7 @@ void blst_p2_affine_serialize(unsigned char out[192], { if (vec_is_zero(in->X, 2*sizeof(in->X))) { bytes_zero(out, 192); - out[0] = 0x40; /* infinitiy bit */ + out[0] = 0x40; /* infinity bit */ } else { (void)POINTonE2_affine_Serialize_BE(out, in); } @@ -219,7 +219,7 @@ static void POINTonE2_Serialize(unsigned char out[192], const POINTonE2 *in) { if (vec_is_zero(in->Z, sizeof(in->Z))) { bytes_zero(out, 192); - out[0] = 0x40; /* infinitiy bit */ + out[0] = 0x40; /* infinity bit */ } else { (void)POINTonE2_Serialize_BE(out, in); } @@ -245,7 +245,7 @@ void blst_p2_affine_compress(unsigned char out[96], const POINTonE2_affine *in) { if (vec_is_zero(in->X, 2*sizeof(in->X))) { bytes_zero(out, 96); - out[0] = 0xc0; /* compressed and infinitiy bits */ + out[0] = 0xc0; /* compressed and infinity bits */ } else { limb_t sign = POINTonE2_affine_Compress_BE(out, in); out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); @@ -269,7 +269,7 @@ void blst_p2_compress(unsigned char out[96], const POINTonE2 *in) { if (vec_is_zero(in->Z, sizeof(in->Z))) { bytes_zero(out, 96); - out[0] = 0xc0; /* compressed and infinitiy bits */ + out[0] = 0xc0; /* compressed and infinity bits */ } else { limb_t sign = POINTonE2_Compress_BE(out, in); out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); diff --git a/crypto/blst_src/ec_mult.h b/crypto/blst_src/ec_mult.h index 192f7337cbf..3c23489570c 100644 --- a/crypto/blst_src/ec_mult.h +++ b/crypto/blst_src/ec_mult.h @@ -46,9 +46,10 @@ static limb_t get_wval_limb(const byte *d, size_t off, size_t bits) static limb_t booth_encode(limb_t wval, size_t sz) { limb_t mask = 0 - (wval >> sz); /* "sign" bit -> mask */ + launder(mask); wval = (wval + 1) >> 1; - wval = (wval & ~mask) | ((0-wval) & mask); + wval = (wval ^ mask) - mask; /* &0x1f, but <=0x10, is index in table, rest is extended "sign" bit */ return wval; @@ -61,7 +62,7 @@ static limb_t booth_encode(limb_t wval, size_t sz) * pass order's bit-length, which is customarily publicly known, instead * of the factual scalars' bit-lengths. 
This is facilitated by point * addition subroutines implemented to handle points at infinity, which - * are encoded as Z==0. [Doubling agorithms handle such points at + * are encoded as Z==0. [Doubling algorithms handle such points at * infinity "naturally," since resulting Z is product of original Z.] */ #define POINT_MULT_SCALAR_WX_IMPL(ptype, SZ) \ diff --git a/crypto/blst_src/exports.c b/crypto/blst_src/exports.c index ad720999883..1ca4d4757fa 100644 --- a/crypto/blst_src/exports.c +++ b/crypto/blst_src/exports.c @@ -19,7 +19,7 @@ #include "bytes.h" /* - * BLS12-381-specifc Fr shortcuts to assembly. + * BLS12-381-specific Fr shortcuts to assembly. */ void blst_fr_add(vec256 ret, const vec256 a, const vec256 b) { add_mod_256(ret, a, b, BLS12_381_r); } @@ -39,6 +39,24 @@ void blst_fr_rshift(vec256 ret, const vec256 a, size_t count) void blst_fr_mul(vec256 ret, const vec256 a, const vec256 b) { mul_mont_sparse_256(ret, a, b, BLS12_381_r, r0); } +void blst_fr_ct_bfly(vec256 x0, vec256 x1, const vec256 twiddle) +{ + vec256 x2; + + mul_mont_sparse_256(x2, x1, twiddle, BLS12_381_r, r0); + sub_mod_256(x1, x0, x2, BLS12_381_r); + add_mod_256(x0, x0, x2, BLS12_381_r); +} + +void blst_fr_gs_bfly(vec256 x0, vec256 x1, const vec256 twiddle) +{ + vec256 x2; + + sub_mod_256(x2, x0, x1, BLS12_381_r); + add_mod_256(x0, x0, x1, BLS12_381_r); + mul_mont_sparse_256(x1, x2, twiddle, BLS12_381_r, r0); +} + void blst_fr_sqr(vec256 ret, const vec256 a) { sqr_mont_sparse_256(ret, a, BLS12_381_r, r0); } @@ -102,27 +120,26 @@ int blst_sk_sub_n_check(pow256 ret, const pow256 a, const pow256 b) int blst_sk_mul_n_check(pow256 ret, const pow256 a, const pow256 b) { - vec256 a_fr, b_fr; + vec256 t[2]; const union { long one; char little; } is_endian = { 1 }; + bool_t is_zero; if (((size_t)a|(size_t)b)%sizeof(limb_t) != 0 || !is_endian.little) { - limbs_from_le_bytes(a_fr, a, sizeof(a_fr)); - limbs_from_le_bytes(b_fr, b, sizeof(a_fr)); - a = (const byte *)a_fr; - b = (const byte *)b_fr; + limbs_from_le_bytes(t[0], a, sizeof(pow256)); + limbs_from_le_bytes(t[1], b, sizeof(pow256)); + a = (const byte *)t[0]; + b = (const byte *)t[1]; } - mul_mont_sparse_256(a_fr, (const limb_t *)a, BLS12_381_rRR, - BLS12_381_r, r0); - mul_mont_sparse_256(b_fr, (const limb_t *)b, BLS12_381_rRR, - BLS12_381_r, r0); - mul_mont_sparse_256(a_fr, a_fr, b_fr, BLS12_381_r, r0); - from_mont_256(a_fr, a_fr, BLS12_381_r, r0); - le_bytes_from_limbs(ret, a_fr, sizeof(a_fr)); - - return (int)(vec_is_zero(a_fr, sizeof(a_fr)) ^ 1); + mul_mont_sparse_256(t[0], BLS12_381_rRR, (const limb_t *)a, BLS12_381_r, r0); + mul_mont_sparse_256(t[0], t[0], (const limb_t *)b, BLS12_381_r, r0); + le_bytes_from_limbs(ret, t[0], sizeof(pow256)); + is_zero = vec_is_zero(t[0], sizeof(vec256)); + vec_zero(t, sizeof(t)); + + return (int)(is_zero^1); } void blst_sk_inverse(pow256 ret, const pow256 a) @@ -150,7 +167,7 @@ void blst_sk_inverse(pow256 ret, const pow256 a) } /* - * BLS12-381-specifc Fp shortcuts to assembly. + * BLS12-381-specific Fp shortcuts to assembly. */ void blst_fp_add(vec384 ret, const vec384 a, const vec384 b) { add_fp(ret, a, b); } @@ -284,7 +301,7 @@ void blst_lendian_from_fp(unsigned char ret[48], const vec384 a) } /* - * BLS12-381-specifc Fp2 shortcuts to assembly. + * BLS12-381-specific Fp2 shortcuts to assembly. 
*/ void blst_fp2_add(vec384x ret, const vec384x a, const vec384x b) { add_fp2(ret, a, b); } @@ -311,7 +328,7 @@ void blst_fp2_cneg(vec384x ret, const vec384x a, int flag) { cneg_fp2(ret, a, is_zero(flag) ^ 1); } /* - * Scalar serialization/deseriazation + * Scalar serialization/deserialization. */ void blst_scalar_from_uint32(pow256 ret, const unsigned int a[8]) { @@ -480,68 +497,75 @@ void blst_uint64_from_fr(unsigned long long ret[4], const vec256 a) int blst_scalar_from_le_bytes(pow256 out, const unsigned char *bytes, size_t n) { - struct { vec256 out, digit, radix; } t; + size_t rem = (n - 1) % 32 + 1; + struct { vec256 out, digit; } t; limb_t ret; vec_zero(t.out, sizeof(t.out)); - vec_copy(t.radix, BLS12_381_rRR, sizeof(t.radix)); - while (n > 32) { - limbs_from_le_bytes(t.digit, bytes, 32); - from_mont_256(t.digit, t.digit, BLS12_381_r, r0); - mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0); + n -= rem; + limbs_from_le_bytes(t.out, bytes += n, rem); + mul_mont_sparse_256(t.out, BLS12_381_rRR, t.out, BLS12_381_r, r0); + + while (n) { + limbs_from_le_bytes(t.digit, bytes -= 32, 32); add_mod_256(t.out, t.out, t.digit, BLS12_381_r); - mul_mont_sparse_256(t.radix, t.radix, BLS12_381_rRR, BLS12_381_r, r0); - bytes += 32; + mul_mont_sparse_256(t.out, BLS12_381_rRR, t.out, BLS12_381_r, r0); n -= 32; } - vec_zero(t.digit, sizeof(t.digit)); - limbs_from_le_bytes(t.digit, bytes, n); - from_mont_256(t.digit, t.digit, BLS12_381_r, r0); - mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0); - add_mod_256(t.out, t.out, t.digit, BLS12_381_r); + from_mont_256(t.out, t.out, BLS12_381_r, r0); ret = vec_is_zero(t.out, sizeof(t.out)); le_bytes_from_limbs(out, t.out, 32); - vec_zero(t.out, 2*sizeof(t.out)); + vec_zero(&t, sizeof(t)); return (int)(ret^1); } int blst_scalar_from_be_bytes(pow256 out, const unsigned char *bytes, size_t n) { - struct { vec256 out, digit, radix; } t; + size_t rem = (n - 1) % 32 + 1; + struct { vec256 out, digit; } t; limb_t ret; vec_zero(t.out, sizeof(t.out)); - vec_copy(t.radix, BLS12_381_rRR, sizeof(t.radix)); - bytes += n; - while (n > 32) { - limbs_from_be_bytes(t.digit, bytes -= 32, 32); - from_mont_256(t.digit, t.digit, BLS12_381_r, r0); - mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0); + limbs_from_be_bytes(t.out, bytes, rem); + mul_mont_sparse_256(t.out, BLS12_381_rRR, t.out, BLS12_381_r, r0); + + while (n -= rem) { + limbs_from_be_bytes(t.digit, bytes += rem, 32); add_mod_256(t.out, t.out, t.digit, BLS12_381_r); - mul_mont_sparse_256(t.radix, t.radix, BLS12_381_rRR, BLS12_381_r, r0); - n -= 32; + mul_mont_sparse_256(t.out, BLS12_381_rRR, t.out, BLS12_381_r, r0); + rem = 32; } - vec_zero(t.digit, sizeof(t.digit)); - limbs_from_be_bytes(t.digit, bytes -= n, n); - from_mont_256(t.digit, t.digit, BLS12_381_r, r0); - mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0); - add_mod_256(t.out, t.out, t.digit, BLS12_381_r); + from_mont_256(t.out, t.out, BLS12_381_r, r0); ret = vec_is_zero(t.out, sizeof(t.out)); le_bytes_from_limbs(out, t.out, 32); - vec_zero(t.out, 2*sizeof(t.out)); + vec_zero(&t, sizeof(t)); return (int)(ret^1); } /* - * Test facilitator + * Single-short SHA-256 hash function. + */ +#include "sha256.h" + +void blst_sha256(unsigned char md[32], const void *msg, size_t len) +{ + SHA256_CTX ctx; + + sha256_init(&ctx); + sha256_update(&ctx, msg, len); + sha256_final(md, &ctx); +} + +/* + * Test facilitator. 
*/ void blst_scalar_from_hexascii(pow256 ret, const char *hex) { bytes_from_hexascii(ret, sizeof(pow256), hex); } diff --git a/crypto/blst_src/fields.h b/crypto/blst_src/fields.h index 515219f62dd..4b2323d2cce 100644 --- a/crypto/blst_src/fields.h +++ b/crypto/blst_src/fields.h @@ -10,7 +10,7 @@ #include "consts.h" /* - * BLS12-381-specifc Fp shortcuts to assembly. + * BLS12-381-specific Fp shortcuts to assembly. */ static inline void add_fp(vec384 ret, const vec384 a, const vec384 b) { add_mod_384(ret, a, b, BLS12_381_P); } @@ -49,7 +49,7 @@ static inline void redc_fp(vec384 ret, const vec768 a) { redc_mont_384(ret, a, BLS12_381_P, p0); } /* - * BLS12-381-specifc Fp2 shortcuts to assembly. + * BLS12-381-specific Fp2 shortcuts to assembly. */ static inline void add_fp2(vec384x ret, const vec384x a, const vec384x b) { add_mod_384x(ret, a, b, BLS12_381_P); } diff --git a/crypto/blst_src/fp12_tower.c b/crypto/blst_src/fp12_tower.c index ab247a8ebf0..d6c0b124eb6 100644 --- a/crypto/blst_src/fp12_tower.c +++ b/crypto/blst_src/fp12_tower.c @@ -545,7 +545,7 @@ static void inverse_fp6(vec384fp6 ret, const vec384fp6 a) mul_by_u_plus_1_fp2(c1, c1); mul_fp2(t0, a[0], a[1]); sub_fp2(c1, c1, t0); - + /* c2 = a1^2 - a0*a2 */ sqr_fp2(c2, a[1]); mul_fp2(t0, a[0], a[2]); @@ -733,7 +733,7 @@ static void frobenius_map_fp12(vec384fp12 ret, const vec384fp12 a, size_t n) /* - * BLS12-381-specifc Fp12 shortcuts. + * BLS12-381-specific Fp12 shortcuts. */ void blst_fp12_sqr(vec384fp12 ret, const vec384fp12 a) { sqr_fp12(ret, a); } diff --git a/crypto/blst_src/multi_scalar.c b/crypto/blst_src/multi_scalar.c index d0b3deefe25..55ab8227718 100644 --- a/crypto/blst_src/multi_scalar.c +++ b/crypto/blst_src/multi_scalar.c @@ -399,7 +399,20 @@ void prefix##s_mult_pippenger(ptype *ret, \ size_t npoints, \ const byte *const scalars[], size_t nbits, \ ptype##xyzz scratch[]) \ -{ ptype##s_mult_pippenger(ret, points, npoints, scalars, nbits, scratch, 0); } +{ \ + if (npoints == 1) { \ + prefix##_from_affine(ret, points[0]); \ + prefix##_mult(ret, ret, scalars[0], nbits); \ + return; \ + } \ + if ((npoints * sizeof(ptype##_affine) * 8 * 3) <= SCRATCH_LIMIT) { \ + ptype##_affine *table = alloca(npoints * sizeof(ptype##_affine) * 8); \ + ptype##s_precompute_wbits(table, 4, points, npoints); \ + ptype##s_mult_wbits(ret, table, 4, npoints, scalars, nbits, NULL); \ + return; \ + } \ + ptype##s_mult_pippenger(ret, points, npoints, scalars, nbits, scratch, 0); \ +} DECLARE_PRIVATE_POINTXYZZ(POINTonE1, 384) POINTXYZZ_TO_JACOBIAN_IMPL(POINTonE1, 384, fp) diff --git a/crypto/blst_src/pairing.c b/crypto/blst_src/pairing.c index b256c44d68a..1396bbadd3b 100644 --- a/crypto/blst_src/pairing.c +++ b/crypto/blst_src/pairing.c @@ -409,6 +409,55 @@ void blst_miller_loop(vec384fp12 ret, const POINTonE2_affine *Q, P ? P : (const POINTonE1_affine *)&BLS12_381_G1, 1); } +#ifndef MILLER_LOOP_N_MAX +# define MILLER_LOOP_N_MAX 16 +#endif + +void blst_miller_loop_n(vec384fp12 out, const POINTonE2_affine *const Qs[], + const POINTonE1_affine *const Ps[], + size_t n) +{ /* ~10KB of stack storage */ + POINTonE2 T[MILLER_LOOP_N_MAX]; + POINTonE2_affine Q[MILLER_LOOP_N_MAX]; + POINTonE1_affine Px2[MILLER_LOOP_N_MAX]; + const POINTonE2_affine *Qptr = NULL; + const POINTonE1_affine *Pptr = NULL; + size_t i, j; + + for (i = 0, j = 0; j < n; j++) { + Qptr = *Qs ? *Qs++ : Qptr+1; + Pptr = *Ps ? *Ps++ : Pptr+1; + + /* Move common expression from line evaluation to line_by_Px2. 
*/ + add_fp(Px2[i].X, Pptr->X, Pptr->X); + neg_fp(Px2[i].X, Px2[i].X); + add_fp(Px2[i].Y, Pptr->Y, Pptr->Y); + + vec_copy(Q[i].X, Qptr->X, 2*sizeof(Q[i].X)); + vec_copy(T[i].X, Qptr->X, 2*sizeof(T[i].X)); + vec_copy(T[i].Z, BLS12_381_Rx.p2, sizeof(T[i].Z)); + + if (++i == MILLER_LOOP_N_MAX || j == n-1) { + vec384fp12 tmp; + vec384fp6 *ret = j < MILLER_LOOP_N_MAX ? out : tmp; + + /* first step is ret = 1^2*line, which is just ret = line */ + start_dbl_n(ret, T, Px2, i); /* 0x2 */ + add_n_dbl_n(ret, T, Q, Px2, i, 2); /* ..0xc */ + add_n_dbl_n(ret, T, Q, Px2, i, 3); /* ..0x68 */ + add_n_dbl_n(ret, T, Q, Px2, i, 9); /* ..0xd200 */ + add_n_dbl_n(ret, T, Q, Px2, i, 32); /* ..0xd20100000000 */ + add_n_dbl_n(ret, T, Q, Px2, i, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ + + if (j >= MILLER_LOOP_N_MAX) + mul_fp12(out, out, ret); + + i = 0; + } + } +} + void blst_final_exp(vec384fp12 ret, const vec384fp12 f) { final_exp(ret, f); } diff --git a/crypto/blst_src/pentaroot.c b/crypto/blst_src/pentaroot.c index fd028113f3d..71f334df50a 100644 --- a/crypto/blst_src/pentaroot.c +++ b/crypto/blst_src/pentaroot.c @@ -6,10 +6,10 @@ #include "fields.h" -static inline void mul_fr(vec384 ret, const vec384 a, const vec384 b) +static inline void mul_fr(vec256 ret, const vec256 a, const vec256 b) { mul_mont_sparse_256(ret, a, b, BLS12_381_r, r0); } -static inline void sqr_fr(vec384 ret, const vec384 a) +static inline void sqr_fr(vec256 ret, const vec256 a) { sqr_mont_sparse_256(ret, a, BLS12_381_r, r0); } #ifdef __OPTIMIZE_SIZE__ diff --git a/crypto/blst_src/vect.h b/crypto/blst_src/vect.h index 3211c8628cf..554dd5daefc 100644 --- a/crypto/blst_src/vect.h +++ b/crypto/blst_src/vect.h @@ -61,7 +61,7 @@ typedef unsigned char byte; typedef byte pow256[256/8]; /* - * Internal Boolean type, Bolean by value, hence safe to cast to or + * Internal Boolean type, Boolean by value, hence safe to cast to or * reinterpret as 'bool'. 
*/ typedef limb_t bool_t; @@ -147,7 +147,6 @@ bool_t ct_is_square_mod_384(const vec384 inp, const vec384 mod); # define mul_mont_384x mulx_mont_384x # define sqr_mont_384x sqrx_mont_384x # define sqr_mont_382x sqrx_mont_382x -# define sqr_n_mul_mont_384x sqrx_n_mul_mont_384x # define mul_382x mulx_382x # define sqr_382x sqrx_382x #endif @@ -156,8 +155,6 @@ void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, const vec384 p, limb_t n0); void sqr_mont_384x(vec384x ret, const vec384x a, const vec384 p, limb_t n0); void sqr_mont_382x(vec384x ret, const vec384x a, const vec384 p, limb_t n0); -void sqr_n_mul_mont_384x(vec384x ret, const vec384x a, size_t count, - const vec384 p, limb_t n0, const vec384x b); void mul_382x(vec768 ret[2], const vec384x a, const vec384x b, const vec384 p); void sqr_382x(vec768 ret[2], const vec384x a, const vec384 p); @@ -214,7 +211,7 @@ typedef const void *uptr_t; #endif #if defined(__GNUC__) || defined(__clang__) -# define launder(var) asm volatile("" : "+r"(var)) +# define launder(var) __asm__ __volatile__("" : "+r"(var)) #else # define launder(var) #endif @@ -249,9 +246,12 @@ static inline void vec_cswap(void *restrict a, void *restrict b, size_t num, { limb_t ai, *ap = (limb_t *)a; limb_t bi, *bp = (limb_t *)b; - limb_t xorm, mask = (limb_t)0 - cbit; + limb_t xorm, mask; size_t i; + launder(cbit); + mask = (limb_t)0 - cbit; + num /= sizeof(limb_t); for (i = 0; i < num; i++) { @@ -377,7 +377,7 @@ static inline void vec_zero(void *ret, size_t num) rp[i] = 0; #if defined(__GNUC__) || defined(__clang__) - asm volatile("" : : "r"(ret) : "memory"); + __asm__ __volatile__("" : : "r"(ret) : "memory"); #endif } @@ -398,7 +398,7 @@ static inline void vec_zero(void *ret, size_t num) # pragma warning(disable: 4127 4189) #endif -#if !defined(__wasm__) +#if !defined(__wasm__) && __STDC_HOSTED__-0 != 0 # include #endif From 241798f34c764a3af499574986834420e58b45d6 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 7 Sep 2023 18:10:51 -0600 Subject: [PATCH 175/200] clean up assembly files include in cgo compilations --- crypto/blst_assembly.S | 117 +------------------------------ crypto/blst_src/README.md | 3 +- crypto/blst_src/build/assembly.S | 116 ++++++++++++++++++++++++++++++ 3 files changed, 118 insertions(+), 118 deletions(-) create mode 100644 crypto/blst_src/build/assembly.S diff --git a/crypto/blst_assembly.S b/crypto/blst_assembly.S index c0c5db30850..fb99b3d985e 100644 --- a/crypto/blst_assembly.S +++ b/crypto/blst_assembly.S @@ -1,116 +1 @@ -#if defined(__x86_64) || defined(__x86_64__) -# if defined(__ELF__) -# if defined(__BLST_PORTABLE__) -# include "elf/sha256-portable-x86_64.s" -# define blst_sha256_block_data_order blst_sha256_block_ssse3 -# endif -# include "elf/sha256-x86_64.s" -# if defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "elf/ctx_inverse_mod_384-x86_64.s" -# endif -# if !defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "elf/ctq_inverse_mod_384-x86_64.s" -# endif -# include "elf/add_mod_384-x86_64.s" -# include "elf/add_mod_384x384-x86_64.s" -# if defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "elf/mulx_mont_384-x86_64.s" -# include "elf/mulx_mont_256-x86_64.s" -# endif -# if !defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "elf/mulq_mont_384-x86_64.s" -# include "elf/mulq_mont_256-x86_64.s" -# endif -# include "elf/add_mod_256-x86_64.s" -# include "elf/ct_inverse_mod_256-x86_64.s" -# include "elf/div3w-x86_64.s" -# include "elf/ct_is_square_mod_384-x86_64.s" -# elif defined(_WIN64) || 
defined(__CYGWIN__) -# include "coff/sha256-x86_64.s" -# if defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "coff/ctx_inverse_mod_384-x86_64.s" -# endif -# if !defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "coff/ctq_inverse_mod_384-x86_64.s" -# endif -# include "coff/add_mod_384-x86_64.s" -# include "coff/add_mod_384x384-x86_64.s" -# if defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "coff/mulx_mont_384-x86_64.s" -# include "coff/mulx_mont_256-x86_64.s" -# endif -# if !defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "coff/mulq_mont_384-x86_64.s" -# include "coff/mulq_mont_256-x86_64.s" -# endif -# include "coff/add_mod_256-x86_64.s" -# include "coff/ct_inverse_mod_256-x86_64.s" -# include "coff/div3w-x86_64.s" -# include "coff/ct_is_square_mod_384-x86_64.s" -# elif defined(__APPLE__) -# include "mach-o/sha256-x86_64.s" -# if defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "mach-o/ctx_inverse_mod_384-x86_64.s" -# endif -# if !defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "mach-o/ctq_inverse_mod_384-x86_64.s" -# endif -# include "mach-o/add_mod_384-x86_64.s" -# include "mach-o/add_mod_384x384-x86_64.s" -# if defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "mach-o/mulx_mont_384-x86_64.s" -# include "mach-o/mulx_mont_256-x86_64.s" -# endif -# if !defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "mach-o/mulq_mont_384-x86_64.s" -# include "mach-o/mulq_mont_256-x86_64.s" -# endif -# include "mach-o/add_mod_256-x86_64.s" -# include "mach-o/ct_inverse_mod_256-x86_64.s" -# include "mach-o/div3w-x86_64.s" -# include "mach-o/ct_is_square_mod_384-x86_64.s" -# endif -#elif defined(__aarch64__) -# if defined(__ELF__) -# include "elf/sha256-armv8.S" -# include "elf/ct_inverse_mod_384-armv8.S" -# include "elf/add_mod_384-armv8.S" -# define __add_mod_384 __add_mont_384 -# define __sub_mod_384 __sub_mont_384 -# include "elf/mul_mont_384-armv8.S" -# include "elf/mul_mont_256-armv8.S" -# include "elf/add_mod_256-armv8.S" -# include "elf/ct_inverse_mod_256-armv8.S" -# include "elf/div3w-armv8.S" -# include "elf/ct_is_square_mod_384-armv8.S" -# elif defined(_WIN64) -# include "coff/sha256-armv8.S" -# include "coff/ct_inverse_mod_384-armv8.S" -# include "coff/add_mod_384-armv8.S" -# define __add_mod_384 __add_mont_384 -# define __sub_mod_384 __sub_mont_384 -# include "coff/mul_mont_384-armv8.S" -# include "coff/mul_mont_256-armv8.S" -# include "coff/add_mod_256-armv8.S" -# include "coff/ct_inverse_mod_256-armv8.S" -# include "coff/div3w-armv8.S" -# include "coff/ct_is_square_mod_384-armv8.S" -# elif defined(__APPLE__) -# include "mach-o/sha256-armv8.S" -# include "mach-o/ct_inverse_mod_384-armv8.S" -# include "mach-o/add_mod_384-armv8.S" -# define __add_mod_384 __add_mont_384 -# define __sub_mod_384 __sub_mont_384 -# include "mach-o/mul_mont_384-armv8.S" -# include "mach-o/mul_mont_256-armv8.S" -# include "mach-o/add_mod_256-armv8.S" -# include "mach-o/ct_inverse_mod_256-armv8.S" -# include "mach-o/div3w-armv8.S" -# include "mach-o/ct_is_square_mod_384-armv8.S" -# endif -#elif defined(__BLST_NO_ASM__) || \ - (defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__==4) -/* inaccurate way to detect a 32-bit processor, but it's close enough */ -#else -# error "unsupported platform" -#endif +# include "assembly.S" diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md index ff63254bbe5..46715d13c2c 100644 --- a/crypto/blst_src/README.md +++ b/crypto/blst_src/README.md @@ -1,5 +1,5 @@ All files in this folder contain source files 
copied from the BLST repo https://github.com/supranational/blst, -specifically from the tagged version v0.3.11. +specifically from the tagged version `v0.3.11`. Copyright Supranational LLC Licensed under the Apache License, Version 2.0, see LICENSE for details. @@ -20,7 +20,6 @@ To upgrade the BLST version: - [ ] copy all `.c` and `.h` files from `/src/` into `./blst_src/`. - [ ] delete `./blst_src/server.c`. - [ ] copy the folder `/build/` into this folder `./blst_src`. -- [ ] move `./blst_src/build/assembly.S` to `./blst_assembly.S`. - [ ] update `./blst_src/blst_src.c` if needed. - [ ] check that C flags in `./bls12381_utils.go` still include the C flags in `/bindings/go/blst.go`. - [ ] solve all breaking changes that may occur. diff --git a/crypto/blst_src/build/assembly.S b/crypto/blst_src/build/assembly.S new file mode 100644 index 00000000000..c0c5db30850 --- /dev/null +++ b/crypto/blst_src/build/assembly.S @@ -0,0 +1,116 @@ +#if defined(__x86_64) || defined(__x86_64__) +# if defined(__ELF__) +# if defined(__BLST_PORTABLE__) +# include "elf/sha256-portable-x86_64.s" +# define blst_sha256_block_data_order blst_sha256_block_ssse3 +# endif +# include "elf/sha256-x86_64.s" +# if defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "elf/ctx_inverse_mod_384-x86_64.s" +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "elf/ctq_inverse_mod_384-x86_64.s" +# endif +# include "elf/add_mod_384-x86_64.s" +# include "elf/add_mod_384x384-x86_64.s" +# if defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "elf/mulx_mont_384-x86_64.s" +# include "elf/mulx_mont_256-x86_64.s" +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "elf/mulq_mont_384-x86_64.s" +# include "elf/mulq_mont_256-x86_64.s" +# endif +# include "elf/add_mod_256-x86_64.s" +# include "elf/ct_inverse_mod_256-x86_64.s" +# include "elf/div3w-x86_64.s" +# include "elf/ct_is_square_mod_384-x86_64.s" +# elif defined(_WIN64) || defined(__CYGWIN__) +# include "coff/sha256-x86_64.s" +# if defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "coff/ctx_inverse_mod_384-x86_64.s" +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "coff/ctq_inverse_mod_384-x86_64.s" +# endif +# include "coff/add_mod_384-x86_64.s" +# include "coff/add_mod_384x384-x86_64.s" +# if defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "coff/mulx_mont_384-x86_64.s" +# include "coff/mulx_mont_256-x86_64.s" +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "coff/mulq_mont_384-x86_64.s" +# include "coff/mulq_mont_256-x86_64.s" +# endif +# include "coff/add_mod_256-x86_64.s" +# include "coff/ct_inverse_mod_256-x86_64.s" +# include "coff/div3w-x86_64.s" +# include "coff/ct_is_square_mod_384-x86_64.s" +# elif defined(__APPLE__) +# include "mach-o/sha256-x86_64.s" +# if defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "mach-o/ctx_inverse_mod_384-x86_64.s" +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "mach-o/ctq_inverse_mod_384-x86_64.s" +# endif +# include "mach-o/add_mod_384-x86_64.s" +# include "mach-o/add_mod_384x384-x86_64.s" +# if defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "mach-o/mulx_mont_384-x86_64.s" +# include "mach-o/mulx_mont_256-x86_64.s" +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "mach-o/mulq_mont_384-x86_64.s" +# include "mach-o/mulq_mont_256-x86_64.s" +# endif +# include "mach-o/add_mod_256-x86_64.s" +# include "mach-o/ct_inverse_mod_256-x86_64.s" +# include 
"mach-o/div3w-x86_64.s" +# include "mach-o/ct_is_square_mod_384-x86_64.s" +# endif +#elif defined(__aarch64__) +# if defined(__ELF__) +# include "elf/sha256-armv8.S" +# include "elf/ct_inverse_mod_384-armv8.S" +# include "elf/add_mod_384-armv8.S" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# include "elf/mul_mont_384-armv8.S" +# include "elf/mul_mont_256-armv8.S" +# include "elf/add_mod_256-armv8.S" +# include "elf/ct_inverse_mod_256-armv8.S" +# include "elf/div3w-armv8.S" +# include "elf/ct_is_square_mod_384-armv8.S" +# elif defined(_WIN64) +# include "coff/sha256-armv8.S" +# include "coff/ct_inverse_mod_384-armv8.S" +# include "coff/add_mod_384-armv8.S" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# include "coff/mul_mont_384-armv8.S" +# include "coff/mul_mont_256-armv8.S" +# include "coff/add_mod_256-armv8.S" +# include "coff/ct_inverse_mod_256-armv8.S" +# include "coff/div3w-armv8.S" +# include "coff/ct_is_square_mod_384-armv8.S" +# elif defined(__APPLE__) +# include "mach-o/sha256-armv8.S" +# include "mach-o/ct_inverse_mod_384-armv8.S" +# include "mach-o/add_mod_384-armv8.S" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# include "mach-o/mul_mont_384-armv8.S" +# include "mach-o/mul_mont_256-armv8.S" +# include "mach-o/add_mod_256-armv8.S" +# include "mach-o/ct_inverse_mod_256-armv8.S" +# include "mach-o/div3w-armv8.S" +# include "mach-o/ct_is_square_mod_384-armv8.S" +# endif +#elif defined(__BLST_NO_ASM__) || \ + (defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__==4) +/* inaccurate way to detect a 32-bit processor, but it's close enough */ +#else +# error "unsupported platform" +#endif From 3bea523639703ae60e3d02d689444d66e0152517 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 7 Sep 2023 18:29:49 -0600 Subject: [PATCH 176/200] update internal/blst files --- crypto/blst_src/README.md | 9 ++- crypto/internal/blst/blst.go | 113 ++++++++++++++++++++++++++------ crypto/internal/blst/blst.h | 9 ++- crypto/internal/blst/blst_aux.h | 8 ++- 4 files changed, 111 insertions(+), 28 deletions(-) diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md index 46715d13c2c..5f70311c6fd 100644 --- a/crypto/blst_src/README.md +++ b/crypto/blst_src/README.md @@ -15,14 +15,17 @@ The folder contains: - this `README` file. To upgrade the BLST version: +- [ ] audit all BLST updated, with focus on `/src`: https://github.com/supranational/blst/compare/v0.3.11... - [ ] delete all files in this folder `./blst_src/` but `blst_src.c` and `README.md`. +- [ ] delete all files in `./internal/blst/`. - [ ] open BLST repository on the new version. - [ ] copy all `.c` and `.h` files from `/src/` into `./blst_src/`. -- [ ] delete `./blst_src/server.c`. +- [ ] delete newly copied `./blst_src/server.c`. - [ ] copy the folder `/build/` into this folder `./blst_src`. -- [ ] update `./blst_src/blst_src.c` if needed. +- [ ] copy `/bindings/blst.h`, `/bindings/blst_aux.h`, and `/bindings/go/blst.go` into `./internal/blst/.`. - [ ] check that C flags in `./bls12381_utils.go` still include the C flags in `/bindings/go/blst.go`. +- [ ] update `./blst_src/blst_src.c` if needed. - [ ] solve all breaking changes that may occur. - [ ] update the commit version on this `./blst_src/README`. -Remember that Flow crypto is using non exported internal functions from BLST. Checking for interfaces breaking changes in BLST should made along with auditing changes between the old and new versions. 
This includes checking logical changes and assumptions beyond interfaces, and assessing their security and performance impact on protocols implemented in Flow crypto. \ No newline at end of file +Note that Flow crypto is using non exported internal functions from BLST. Checking for interfaces breaking changes in BLST should be done along with auditing changes between the old and new versions. This includes checking logical changes and assumptions beyond interfaces, and assessing their security and performance impact on protocols implemented in Flow crypto. diff --git a/crypto/internal/blst/blst.go b/crypto/internal/blst/blst.go index 037e40d98a3..c890f55e367 100644 --- a/crypto/internal/blst/blst.go +++ b/crypto/internal/blst/blst.go @@ -1,16 +1,8 @@ +//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +// DO NOT EDIT THIS FILE!! +// The file is generated from *.tgo by generate.py +//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! /* - * This package is equivalent to the BLST Go package including all Go exported - * functions. BLST outer Go layer is used to cross-check flow-go/crypto BLS implementation. - * Note that flow-go/crypto uses BLST internal tools only to implement protocols based on BLS12-381, - * but does not use BLST outer layer and BLS implementation. - * Ideally, the cross-check tests would import github.com/supranational/blst. However this is - * not possible in Go as it causes multiple duplicated C objects. Creating the internal blst - * package is a workaround to achieve the same purpose. Note that the internal package - * implicitly uses the C objects declared by flow-go/crypto. - * - * Note: linter staticcheck was added in two spots to avoid linter false positives. - * - * Copied from https://github.com/supranational/blst. * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. * SPDX-License-Identifier: Apache-2.0 @@ -18,7 +10,7 @@ package blst -// #cgo CFLAGS: -I${SRCDIR} -I${SRCDIR}/../../blst_src/build -I${SRCDIR}/../../blst_src -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset +// #cgo CFLAGS: -I${SRCDIR}/..
-I${SRCDIR}/../../build -I${SRCDIR}/../../src -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ // #include "blst.h" @@ -132,6 +124,25 @@ package blst // blst_expand_message_xmd(elem, sizeof(elem), msg, msg_len, DST, DST_len); // return blst_scalar_from_be_bytes(ret, elem, sizeof(elem)); // } +// static void go_miller_loop_n(blst_fp12 *dst, const blst_p2_affine Q[], +// const blst_p1_affine P[], +// size_t npoints, bool acc) +// { const blst_p2_affine *Qs[2] = { Q, NULL }; +// const blst_p1_affine *Ps[2] = { P, NULL }; +// if (acc) { +// blst_fp12 tmp; +// blst_miller_loop_n(&tmp, Qs, Ps, npoints); +// blst_fp12_mul(dst, dst, &tmp); +// } else { +// blst_miller_loop_n(dst, Qs, Ps, npoints); +// } +// } +// static void go_fp12slice_mul(blst_fp12 *dst, const blst_fp12 in[], size_t n) +// { size_t i; +// blst_fp12_mul(dst, &in[0], &in[1]); +// for (i = 2; i < n; i++) +// blst_fp12_mul(dst, dst, &in[i]); +// } import "C" import ( "fmt" @@ -358,6 +369,64 @@ func Fp12MillerLoop(q *P2Affine, p *P1Affine) *Fp12 { return &pt } +func Fp12MillerLoopN(qs []P2Affine, ps []P1Affine) *Fp12 { + if len(qs) != len(ps) || len(qs) == 0 { + panic("inputs' lengths mismatch") + } + + nElems := uint32(len(qs)) + nThreads := uint32(maxProcs) + + if nThreads == 1 || nElems == 1 { + var pt Fp12 + C.go_miller_loop_n(&pt, &qs[0], &ps[0], C.size_t(nElems), false) + return &pt + } + + stride := (nElems + nThreads - 1) / nThreads + if stride > 16 { + stride = 16 + } + + strides := (nElems + stride - 1) / stride + if nThreads > strides { + nThreads = strides + } + + msgsCh := make(chan Fp12, nThreads) + curElem := uint32(0) + + for tid := uint32(0); tid < nThreads; tid++ { + go func() { + acc := Fp12One() + first := true + for { + work := atomic.AddUint32(&curElem, stride) - stride + if work >= nElems { + break + } + n := nElems - work + if n > stride { + n = stride + } + C.go_miller_loop_n(&acc, &qs[work], &ps[work], C.size_t(n), + C.bool(!first)) + first = false + } + msgsCh <- acc + }() + } + + var ret = make([]Fp12, nThreads) + for i := range ret { + ret[i] = <-msgsCh + } + + var pt Fp12 + C.go_fp12slice_mul(&pt, &ret[0], C.size_t(nThreads)) + return &pt +} + func (pt *Fp12) MulAssign(p *Fp12) { C.blst_fp12_mul(pt, pt, p) } @@ -376,6 +445,10 @@ func (pt *Fp12) ToBendian() []byte { return out[:] } +func (pt1 *Fp12) Equals(pt2 *Fp12) bool { + return *pt1 == *pt2 +} + // // MIN-PK // @@ -399,8 +472,10 @@ func (pk *P1Affine) KeyValidate() bool { // always cryptographically safe, but application might want // to guard against obviously bogus individual[!] signatures. func (sig *P2Affine) SigValidate(sigInfcheck bool) bool { - return (sigInfcheck && !bool(C.blst_p2_affine_is_inf(sig))) || - bool(C.blst_p2_affine_in_g2(sig)) + if sigInfcheck && bool(C.blst_p2_affine_is_inf(sig)) { + return false + } + return bool(C.blst_p2_affine_in_g2(sig)) } // @@ -589,7 +664,6 @@ func coreAggregateVerifyPkInG1(sigFn sigGetterP2, sigGroupcheck bool, // main thread has completed its miller loop before // proceeding. mutex.Lock() - //nolint:staticcheck mutex.Unlock() } @@ -1018,8 +1092,10 @@ func (pk *P2Affine) KeyValidate() bool { // always cryptographically safe, but application might want // to guard against obviously bogus individual[!] signatures. 
func (sig *P1Affine) SigValidate(sigInfcheck bool) bool { - return (sigInfcheck && !bool(C.blst_p1_affine_is_inf(sig))) || - bool(C.blst_p1_affine_in_g1(sig)) + if sigInfcheck && bool(C.blst_p1_affine_is_inf(sig)) { + return false + } + return bool(C.blst_p1_affine_in_g1(sig)) } // @@ -1208,7 +1284,6 @@ func coreAggregateVerifyPkInG2(sigFn sigGetterP1, sigGroupcheck bool, // main thread has completed its miller loop before // proceeding. mutex.Lock() - //nolint:staticcheck mutex.Unlock() } diff --git a/crypto/internal/blst/blst.h b/crypto/internal/blst/blst.h index 2e314b3a32e..1349896a3f8 100644 --- a/crypto/internal/blst/blst.h +++ b/crypto/internal/blst/blst.h @@ -95,10 +95,6 @@ void blst_fr_sqr(blst_fr *ret, const blst_fr *a); void blst_fr_cneg(blst_fr *ret, const blst_fr *a, bool flag); void blst_fr_eucl_inverse(blst_fr *ret, const blst_fr *a); void blst_fr_inverse(blst_fr *ret, const blst_fr *a); -#ifdef BLST_FR_PENTAROOT -void blst_fr_pentaroot(blst_fr *ret, const blst_fr *a); -void blst_fr_pentapow(blst_fr *ret, const blst_fr *a); -#endif void blst_fr_from_uint64(blst_fr *ret, const uint64_t a[4]); void blst_uint64_from_fr(uint64_t ret[4], const blst_fr *a); @@ -341,6 +337,9 @@ void blst_sign_pk_in_g2(blst_p1 *out_sig, const blst_p1 *hash, #ifndef SWIG void blst_miller_loop(blst_fp12 *ret, const blst_p2_affine *Q, const blst_p1_affine *P); +void blst_miller_loop_n(blst_fp12 *ret, const blst_p2_affine *const Qs[], + const blst_p1_affine *const Ps[], + size_t n); void blst_final_exp(blst_fp12 *ret, const blst_fp12 *f); void blst_precompute_lines(blst_fp6 Qlines[68], const blst_p2_affine *Q); void blst_miller_loop_lines(blst_fp12 *ret, const blst_fp6 Qlines[68], @@ -480,4 +479,4 @@ extern const blst_p2_affine BLS12_381_NEG_G2; #ifdef __cplusplus } #endif -#endif \ No newline at end of file +#endif diff --git a/crypto/internal/blst/blst_aux.h b/crypto/internal/blst/blst_aux.h index d96b1f3dd3b..3de0850e330 100644 --- a/crypto/internal/blst/blst_aux.h +++ b/crypto/internal/blst/blst_aux.h @@ -10,8 +10,14 @@ * depending on their proven/unproven worthiness. */ +void blst_fr_ct_bfly(blst_fr *x0, blst_fr *x1, const blst_fr *twiddle); +void blst_fr_gs_bfly(blst_fr *x0, blst_fr *x1, const blst_fr *twiddle); void blst_fr_to(blst_fr *ret, const blst_fr *a); void blst_fr_from(blst_fr *ret, const blst_fr *a); +#ifdef BLST_FR_PENTAROOT +void blst_fr_pentaroot(blst_fr *ret, const blst_fr *a); +void blst_fr_pentapow(blst_fr *ret, const blst_fr *a); +#endif void blst_fp_to(blst_fp *ret, const blst_fp *a); void blst_fp_from(blst_fp *ret, const blst_fp *a); @@ -108,4 +114,4 @@ size_t blst_fp12_sizeof(void); * Single-shot SHA-256 hash function. */ void blst_sha256(byte out[32], const byte *msg, size_t msg_len); -#endif \ No newline at end of file +#endif From c292bc44babd8850e79c0cc7291eba4fc0e93fa6 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 7 Sep 2023 18:33:10 -0600 Subject: [PATCH 177/200] fix a readme typo --- crypto/blst_src/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md index 5f70311c6fd..50ca45ea7d6 100644 --- a/crypto/blst_src/README.md +++ b/crypto/blst_src/README.md @@ -15,7 +15,7 @@ The folder contains: - this `README` file. To upgrade the BLST version: -- [ ] audit all BLST updated, with focus on `/src`: https://github.com/supranational/blst/compare/v0.3.11... +- [ ] audit all BLST updates, with focus on `/src`: https://github.com/supranational/blst/compare/v0.3.11... 
- [ ] delete all files in this folder `./blst_src/` but `blst_src.c` and `README.md`. - [ ] delete all files in `./internal/blst/`. - [ ] open BLST repository on the new version. From 0d09d5517e03417e6fee79e11f5a32ae8e9ea892 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 8 Sep 2023 19:22:50 -0600 Subject: [PATCH 178/200] tmp tmate to debug --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b24de2f44ca..db68ffe199c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -111,6 +111,8 @@ jobs: cache: true - name: Setup tests (${{ matrix.targets.name }}) run: VERBOSE=1 make -e GO_TEST_PACKAGES="${{ matrix.targets.packages }}" install-tools + - name: Setup tmate session + uses: mxschmitt/action-tmate@v3 - name: Run tests (${{ matrix.targets.name }}) uses: nick-fields/retry@v2 with: From dce50f91c09f81efe78776269c0ee99355c95b55 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 12 Sep 2023 13:49:58 -0600 Subject: [PATCH 179/200] make start up node time larger to accommodate failing TestClusterSwitchover_MultiCluster --- .github/workflows/ci.yml | 2 -- engine/collection/test/cluster_switchover_test.go | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index db68ffe199c..b24de2f44ca 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -111,8 +111,6 @@ jobs: cache: true - name: Setup tests (${{ matrix.targets.name }}) run: VERBOSE=1 make -e GO_TEST_PACKAGES="${{ matrix.targets.packages }}" install-tools - - name: Setup tmate session - uses: mxschmitt/action-tmate@v3 - name: Run tests (${{ matrix.targets.name }}) uses: nick-fields/retry@v2 with: diff --git a/engine/collection/test/cluster_switchover_test.go b/engine/collection/test/cluster_switchover_test.go index a8f04173099..15a23823ab3 100644 --- a/engine/collection/test/cluster_switchover_test.go +++ b/engine/collection/test/cluster_switchover_test.go @@ -212,7 +212,7 @@ func (tc *ClusterSwitchoverTestCase) StartNodes() { nodes = append(nodes, node) } - unittest.RequireCloseBefore(tc.T(), util.AllReady(nodes...), time.Second, "could not start nodes") + unittest.RequireCloseBefore(tc.T(), util.AllReady(nodes...), 3*time.Second, "could not start nodes") // start continuous delivery for all nodes for _, node := range tc.nodes { From ba78ef6dc9d9752bf58ddeba450b1a511edb32f9 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 20 Sep 2023 19:50:32 -0600 Subject: [PATCH 180/200] makefile typo --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b471e25ee09..874d56f8d72 100644 --- a/Makefile +++ b/Makefile @@ -222,7 +222,7 @@ generate-mocks: install-mock-generators tidy: go mod tidy -v cd integration; go mod tidy -v - cd crypo; go mod tidy -v + cd crypto; go mod tidy -v cd cmd/testclient; go mod tidy -v cd insecure; go mod tidy -v git diff --exit-code From 8ff9b79d805f64914d39d6af71d300e9c6098ebe Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 20 Sep 2023 19:50:59 -0600 Subject: [PATCH 181/200] clean up C bls12_381 utils --- crypto/bls12381_utils.c | 287 +++++++++++++++++++++------------------- crypto/blst_include.h | 12 +- 2 files changed, 154 insertions(+), 145 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index e4636aad457..9f168e0b3e0 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -11,15 +11,18 @@ // make sure flow 
crypto types are consistent with BLST types void types_sanity(void) { + assert(sizeof(vec256) == sizeof(Fr)); assert(sizeof(Fp) == sizeof(vec384)); + assert(sizeof(vec384x) == sizeof(Fp2)); assert(sizeof(E1) == sizeof(POINTonE1)); assert(sizeof(E2) == sizeof(POINTonE2)); + assert(sizeof(vec384fp12) == sizeof(Fp12)); } - + // ------------------- Fr utilities // Montgomery constant R related to the curve order r -// R mod r = (1<<256)%r +// R = (1<<256) mod r const Fr BLS12_381_rR = {{ TO_LIMB_T(0x1824b159acc5056f), TO_LIMB_T(0x998c4fefecbc4ff5), @@ -27,7 +30,7 @@ const Fr BLS12_381_rR = {{ TO_LIMB_T(0x00000001fffffffe), }}; -// returns true if a == 0 and false otherwise +// returns true if a is zero and false otherwise bool Fr_is_zero(const Fr *a) { return vec_is_zero(a, sizeof(Fr)); } // returns true if a == b and false otherwise @@ -75,8 +78,8 @@ void Fr_squ_montg(Fr *res, const Fr *a) { // res = a*R void Fr_to_montg(Fr *res, const Fr *a) { - mul_mont_sparse_256((limb_t *)res, (limb_t *)a, BLS12_381_rRR, BLS12_381_r, - r0); + mul_mont_sparse_256((limb_t *)res, (limb_t *)a, + BLS12_381_rRR, BLS12_381_r, r0); } // res = a*R^(-1) @@ -101,9 +104,9 @@ void Fr_inv_montg_eucl(Fr *res, const Fr *a) { } // computes the sum of the array elements and writes the sum in jointx -void Fr_sum_vector(Fr *jointx, const Fr x[], const int len) { +void Fr_sum_vector(Fr *jointx, const Fr x[], const int x_len) { Fr_set_zero(jointx); - for (int i = 0; i < len; i++) { + for (int i = 0; i < x_len; i++) { Fr_add(jointx, jointx, &x[i]); } } @@ -118,10 +121,10 @@ static void pow256_from_be_bytes(pow256 ret, const byte a[Fr_BYTES]) { *(ret++) = *b; *(b--) = tmp; } - return; - } - for (int i = 0; i < Fr_BYTES; i++) { - *(ret++) = *(b--); + } else { + for (int i = 0; i < Fr_BYTES; i++) { + *(ret++) = *(b--); + } } } @@ -136,19 +139,19 @@ static void pow256_from_Fr(pow256 ret, const Fr *in) { // - BAD_ENCODING if the length is invalid // - BAD_VALUE if the scalar isn't in Fr // - VALID if the scalar is valid -ERROR Fr_read_bytes(Fr *a, const byte *bin, int len) { - if (len != Fr_BYTES) { +ERROR Fr_read_bytes(Fr *a, const byte *in, int in_len) { + if (in_len != Fr_BYTES) { return BAD_ENCODING; } - // compare to r using the BLST tool + // compare to r using BLST internal function pow256 tmp; - pow256_from_be_bytes(tmp, bin); + pow256_from_be_bytes(tmp, in); // (check_mod_256 compares pow256 against a vec256!) if (!check_mod_256(tmp, BLS12_381_r)) { return BAD_VALUE; } vec_zero(tmp, sizeof(tmp)); - limbs_from_be_bytes((limb_t *)a, bin, Fr_BYTES); + limbs_from_be_bytes((limb_t *)a, in, Fr_BYTES); return VALID; } @@ -158,8 +161,8 @@ ERROR Fr_read_bytes(Fr *a, const byte *bin, int len) { // - BAD_ENCODING if the length is invalid // - BAD_VALUE if the scalar isn't in Fr_star // - VALID if the scalar is valid -ERROR Fr_star_read_bytes(Fr *a, const byte *bin, int len) { - int ret = Fr_read_bytes(a, bin, len); +ERROR Fr_star_read_bytes(Fr *a, const byte *in, int in_len) { + int ret = Fr_read_bytes(a, in, in_len); if (ret != VALID) { return ret; } @@ -171,9 +174,9 @@ ERROR Fr_star_read_bytes(Fr *a, const byte *bin, int len) { } // write Fr element `a` in big endian bytes. -void Fr_write_bytes(byte *bin, const Fr *a) { +void Fr_write_bytes(byte *out, const Fr *a) { // be_bytes_from_limbs works for both limb endianness types - be_bytes_from_limbs(bin, (limb_t *)a, Fr_BYTES); + be_bytes_from_limbs(out, (limb_t *)a, Fr_BYTES); } // maps big-endian bytes of any size into an Fr element using modular reduction. 
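Note: Fr_from_be_bytes / map_bytes_to_Fr below reduce an arbitrary-length big-endian integer modulo the BLS12-381 group order r; the C code works chunk by chunk with Montgomery multiplications, but the end result matches this big-integer reference sketch (illustrative input only):

package main

import (
	"fmt"
	"math/big"
)

func main() {
	// BLS12-381 subgroup order r
	r, _ := new(big.Int).SetString(
		"73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001", 16)

	in := make([]byte, 48) // any input length is accepted by the reduction
	in[47] = 7             // example input: the integer 7

	reduced := new(big.Int).Mod(new(big.Int).SetBytes(in), r)
	fmt.Println("input mod r =", reduced) // 7, already smaller than r
}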
@@ -181,7 +184,7 @@ void Fr_write_bytes(byte *bin, const Fr *a) { // // Note: could use redc_mont_256(vec256 ret, const vec512 a, const vec256 p, // limb_t n0) to reduce 512 bits at a time. -static void Fr_from_be_bytes(Fr *out, const byte *bytes, size_t n) { +static void Fr_from_be_bytes(Fr *out, const byte *in, const int in_len) { // input can be written in base 2^|R|, with R the Montgomery constant // N = l_1 + L_2*2^|R| .. + L_n*2^(|R|*(n-1)) // Therefore N mod p can be expressed using R as: @@ -190,7 +193,8 @@ static void Fr_from_be_bytes(Fr *out, const byte *bytes, size_t n) { Fr_set_zero(out); Fr_copy(&radix, (Fr *)BLS12_381_rRR); // R^2 - byte *p = (byte *)bytes + n; + int n = in_len; + byte *p = (byte *)in + in_len; while (n > Fr_BYTES) { // limbs_from_be_bytes works for both limb endiannesses limbs_from_be_bytes((limb_t *)&digit, p -= Fr_BYTES, Fr_BYTES); // l_i @@ -214,8 +218,8 @@ static void Fr_from_be_bytes(Fr *out, const byte *bytes, size_t n) { // Reads a scalar from an array and maps it to Fr using modular reduction. // Input is byte-big-endian as used by the external APIs. // It returns true if scalar is zero and false otherwise. -bool map_bytes_to_Fr(Fr *a, const byte *bin, int len) { - Fr_from_be_bytes(a, bin, len); +bool map_bytes_to_Fr(Fr *a, const byte *in, int in_len) { + Fr_from_be_bytes(a, in, in_len); return Fr_is_zero(a); } @@ -262,15 +266,15 @@ static bool Fp_sqrt_montg(Fp *res, const Fp *a) { return sqrt_fp((limb_t *)res, (limb_t *)a); } -static bool Fp_check(const Fp *in) { +static bool Fp_check(const Fp *a) { // use same method as in BLST internal function // which seems the most efficient. The method uses the assembly-based // modular addition instead of limbs comparison Fp temp; - Fp_add(&temp, in, &ZERO_384); - return vec_is_equal(&temp, in, Fp_BYTES); - // no need to clear `tmp` as no use-case involves sensitive data being passed - // as `in` + Fp_add(&temp, a, &ZERO_384); + return vec_is_equal(&temp, a, Fp_BYTES); + // no need to clear `tmp` as no current use-case involves sensitive data being passed + // as `a` } // res = a*b*R^(-1) @@ -293,36 +297,36 @@ void Fp_from_montg(Fp *res, const Fp *a) { from_mont_384((limb_t *)res, (limb_t *)a, BLS12_381_P, p0); } -// reads a scalar in `a` and checks it is a valid Fp element (a < p). +// reads a scalar in `out` and checks it is a valid Fp element (out < p). // input is bytes-big-endian. // returns: // - BAD_ENCODING if the length is invalid // - BAD_VALUE if the scalar isn't in Fp // - VALID if the scalar is valid -ERROR Fp_read_bytes(Fp *a, const byte *bin, int len) { - if (len != Fp_BYTES) { +ERROR Fp_read_bytes(Fp *out, const byte *in, int in_len) { + if (in_len != Fp_BYTES) { return BAD_ENCODING; } - limbs_from_be_bytes((limb_t *)a, bin, Fp_BYTES); + limbs_from_be_bytes((limb_t *)out, in, Fp_BYTES); // compare read scalar to p - if (!Fp_check(a)) { + if (!Fp_check(out)) { return BAD_VALUE; } return VALID; } -// write Fp element to `bin`, -// assuming `bin` has `Fp_BYTES` allocated bytes. -void Fp_write_bytes(byte *bin, const Fp *a) { - be_bytes_from_limbs(bin, (limb_t *)a, Fp_BYTES); +// write Fp element to `out`, +// assuming `out` has `Fp_BYTES` allocated bytes. +void Fp_write_bytes(byte *out, const Fp *a) { + be_bytes_from_limbs(out, (limb_t *)a, Fp_BYTES); } -// returns the sign of y. +// returns the sign of y: // 1 if y > (p - 1)/2 and 0 otherwise. -// y is in montgomery form +// y is in montgomery form! 
static byte Fp_get_sign(const Fp *y) { - // BLST's sgn0_pty_mont_384 requires input to be in Montg form. - // The needed sign bit is on position 1 ! + // - BLST's sgn0_pty_mont_384 requires input to be in Montg form. + // - The needed sign bit is on position 1 return (sgn0_pty_mont_384((const limb_t *)y, BLS12_381_P, p0) >> 1) & 1; } @@ -361,18 +365,19 @@ static void Fp2_squ_montg(Fp2 *res, const Fp2 *a) { // the square root in `res`. // // The boolean output is valid whether `a` is in Montgomery form or not, -// since montgomery constant `R` is a quadratic residue. -// However, the square root is valid only if `a` is in montgomery form. +// since montgomery constant `R` is itself a quadratic residue. +// However, the square root is correct only if `a` is in montgomery form +// (the square root would be in montgomery form too). static bool Fp2_sqrt_montg(Fp2 *res, const Fp2 *a) { return sqrt_fp2((vec384 *)res, (vec384 *)a); } -// returns the sign of y. -// sign(y_0) if y_1 = 0, else sign(y_1) -// y coordinates must be in montgomery form +// returns the sign of y: +// sign(y_0) if y_1 = 0, else sign(y_1). +// y coordinates must be in montgomery form! static byte Fp2_get_sign(Fp2 *y) { - // BLST's sgn0_pty_mont_384x requires input to be in Montg form. - // The needed sign bit is on position 1 ! + // - BLST's sgn0_pty_mont_384x requires input to be in montgomery form. + // - the sign bit is on position 1 return (sgn0_pty_mont_384x((vec384 *)y, BLS12_381_P, p0) >> 1) & 1; } @@ -383,15 +388,15 @@ static byte Fp2_get_sign(Fp2 *y) { // - BAD_ENCODING if the length is invalid // - BAD_VALUE if the scalar isn't in Fp // - VALID if the scalar is valid -static ERROR Fp2_read_bytes(Fp2 *a, const byte *bin, int len) { - if (len != Fp2_BYTES) { +static ERROR Fp2_read_bytes(Fp2 *a, const byte *in, int in_len) { + if (in_len != Fp2_BYTES) { return BAD_ENCODING; } - ERROR ret = Fp_read_bytes(&real(a), bin, Fp_BYTES); + ERROR ret = Fp_read_bytes(&real(a), in, Fp_BYTES); if (ret != VALID) { return ret; } - ret = Fp_read_bytes(&imag(a), bin + Fp_BYTES, Fp_BYTES); + ret = Fp_read_bytes(&imag(a), in + Fp_BYTES, Fp_BYTES); if (ret != VALID) { return ret; } @@ -399,9 +404,9 @@ static ERROR Fp2_read_bytes(Fp2 *a, const byte *bin, int len) { } // write Fp2 element to bin and assume `bin` has `Fp2_BYTES` allocated bytes. -void Fp2_write_bytes(byte *bin, const Fp2 *a) { - Fp_write_bytes(bin, &real(a)); - Fp_write_bytes(bin + Fp_BYTES, &imag(a)); +void Fp2_write_bytes(byte *out, const Fp2 *a) { + Fp_write_bytes(out, &real(a)); + Fp_write_bytes(out + Fp_BYTES, &imag(a)); } // ------------------- E1 utilities @@ -419,13 +424,13 @@ bool E1_is_equal(const E1 *p1, const E1 *p2) { return POINTonE1_is_equal((const POINTonE1 *)p1, (const POINTonE1 *)p2); } -// compare p to infinity +// compare `p` to infinity bool E1_is_infty(const E1 *p) { // BLST infinity points are defined by Z=0 return vec_is_zero(p->z, sizeof(p->z)); } -// set p to infinity +// set `p` to infinity void E1_set_infty(E1 *p) { // BLST infinity points are defined by Z=0 vec_zero(p->z, sizeof(p->z)); @@ -444,7 +449,7 @@ void E1_to_affine(E1 *res, const E1 *p) { // checks affine point `p` is in E1 bool E1_affine_on_curve(const E1 *p) { - // BLST's `POINTonE1_affine_on_curve` does not include the inifity case! + // BLST's `POINTonE1_affine_on_curve` does not include the infinity case! return POINTonE1_affine_on_curve((POINTonE1_affine *)p) | E1_is_infty(p); } @@ -452,6 +457,7 @@ bool E1_affine_on_curve(const E1 *p) { // It assumes input `p` is on E1. 
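Note: E1_read_bytes below (and E2_read_bytes further down) parse the ZCash-style header of a serialized point: in the first byte, bit 7 is the compression flag, bit 6 the infinity flag and bit 5 the sign of y. A tiny sketch of that decoding, on an illustrative header byte:

package main

import "fmt"

func main() {
	header := byte(0xC0) // example: compressed encoding of the point at infinity

	compressed := (header>>7)&1 == 1
	infinity := (header>>6)&1 == 1
	ySign := (header >> 5) & 1

	fmt.Println("compressed:", compressed) // true
	fmt.Println("infinity:  ", infinity)   // true
	fmt.Println("y sign bit:", ySign)      // 0, must be 0 for an infinity encoding
}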
bool E1_in_G1(const E1 *p) { // currently uses Scott method + // TODO: compare to clearing the cofactor using u-1 return POINTonE1_in_G1((const POINTonE1 *)p); } @@ -469,27 +475,27 @@ bool E1_in_G1(const E1 *p) { // Note: could use POINTonE1_Deserialize_BE and POINTonE1_Uncompress_Z, // but needs to update the logic around G2 subgroup check -ERROR E1_read_bytes(E1 *a, const byte *bin, const int len) { +ERROR E1_read_bytes(E1 *a, const byte *in, const int in_len) { // check the length - if (len != G1_SER_BYTES) { + if (in_len != G1_SER_BYTES) { return BAD_ENCODING; } // check the compression bit - int compressed = bin[0] >> 7; + int compressed = in[0] >> 7; if ((compressed == 1) != (G1_SERIALIZATION == COMPRESSED)) { return BAD_ENCODING; } // check if the point in infinity - int is_infinity = bin[0] & 0x40; + int is_infinity = in[0] & 0x40; if (is_infinity) { // the remaining bits need to be cleared - if (bin[0] & 0x3F) { + if (in[0] & 0x3F) { return BAD_ENCODING; } for (int i = 1; i < G1_SER_BYTES - 1; i++) { - if (bin[i]) { + if (in[i]) { return BAD_ENCODING; } } @@ -498,14 +504,14 @@ ERROR E1_read_bytes(E1 *a, const byte *bin, const int len) { } // read the sign bit and check for consistency - int y_sign = (bin[0] >> 5) & 1; + int y_sign = (in[0] >> 5) & 1; if (y_sign && (!compressed)) { return BAD_ENCODING; } // use a temporary buffer to mask the header bits and read a.x byte temp[Fp_BYTES]; - memcpy(temp, bin, Fp_BYTES); + memcpy(temp, in, Fp_BYTES); temp[0] &= 0x1F; // clear the header bits ERROR ret = Fp_read_bytes(&a->x, temp, sizeof(temp)); if (ret != VALID) { @@ -517,7 +523,7 @@ ERROR E1_read_bytes(E1 *a, const byte *bin, const int len) { Fp_copy(&a->z, &BLS12_381_pR); if (G1_SERIALIZATION == UNCOMPRESSED) { - ret = Fp_read_bytes(&a->y, bin + Fp_BYTES, sizeof(a->y)); + ret = Fp_read_bytes(&a->y, in + Fp_BYTES, sizeof(a->y)); if (ret != VALID) { return ret; } @@ -532,13 +538,13 @@ ERROR E1_read_bytes(E1 *a, const byte *bin, const int len) { // compute the possible square root Fp_squ_montg(&a->y, &a->x); Fp_mul_montg(&a->y, &a->y, &a->x); // x^3 - Fp_add(&a->y, &a->y, &B_E1); // B_E1 is already in Montg form + Fp_add(&a->y, &a->y, &B_E1); // B_E1 is already in montg form // check whether x^3+b is a quadratic residue if (!Fp_sqrt_montg(&a->y, &a->y)) { return POINT_NOT_ON_CURVE; } - // resulting (x,y) is guaranteed to be on curve (y is already in Montg form) + // resulting (x,y) is guaranteed to be on curve (y is already in montg form) if (Fp_get_sign(&a->y) != y_sign) { Fp_neg(&a->y, &a->y); // flip y sign if needed } @@ -549,27 +555,27 @@ ERROR E1_read_bytes(E1 *a, const byte *bin, const int len) { // uncompressed form. 
It assumes buffer is of length G1_SER_BYTES The // serialization follows: // https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -void E1_write_bytes(byte *bin, const E1 *a) { +void E1_write_bytes(byte *out, const E1 *a) { if (E1_is_infty(a)) { // set the infinity bit - bin[0] = (G1_SERIALIZATION << 7) | (1 << 6); - memset(bin + 1, 0, G1_SER_BYTES - 1); + out[0] = (G1_SERIALIZATION << 7) | (1 << 6); + memset(out + 1, 0, G1_SER_BYTES - 1); return; } E1 tmp; E1_to_affine(&tmp, a); Fp_from_montg(&tmp.x, &tmp.x); - Fp_write_bytes(bin, &tmp.x); + Fp_write_bytes(out, &tmp.x); if (G1_SERIALIZATION == COMPRESSED) { - bin[0] |= (Fp_get_sign(&tmp.y) << 5); + out[0] |= (Fp_get_sign(&tmp.y) << 5); } else { Fp_from_montg(&tmp.y, &tmp.y); - Fp_write_bytes(bin + Fp_BYTES, &tmp.y); + Fp_write_bytes(out + Fp_BYTES, &tmp.y); } // compression bit - bin[0] |= (G1_SERIALIZATION << 7); + out[0] |= (G1_SERIALIZATION << 7); } // generic point addition that must handle doubling and points at infinity @@ -599,28 +605,29 @@ void E1_sum_vector(E1 *sum, const E1 *y, const int len) { } } -// Computes the sum of input signatures (E1 elements) flattened in a single byte -// array `sigs_bytes` of `sigs_len` bytes. and writes the sum (E1 element) as -// bytes in `dest`. The function does not check membership of E1 inputs in G1 +// Computes the sum of input E1 elements flattened in a single byte +// array `in_bytes` of `in_len` bytes. and writes the sum (E1 element) as +// bytes in `out`. +// The function does not check membership of E1 inputs in G1 // subgroup. The header is using byte pointers to minimize Cgo calls from the Go // layer. -int E1_sum_vector_byte(byte *dest, const byte *sigs_bytes, const int sigs_len) { +int E1_sum_vector_byte(byte *out, const byte *in_bytes, const int in_len) { int error = UNDEFINED; // sanity check that `len` is multiple of `G1_SER_BYTES` - if (sigs_len % G1_SER_BYTES) { + if (in_len % G1_SER_BYTES) { error = INVALID; goto mem_error; } - int n = sigs_len / G1_SER_BYTES; // number of signatures + int n = in_len / G1_SER_BYTES; // number of signatures - E1 *sigs = (E1 *)malloc(n * sizeof(E1)); - if (!sigs) + E1 *vec = (E1 *)malloc(n * sizeof(E1)); + if (!vec) goto mem_error; // import the points from the array for (int i = 0; i < n; i++) { // deserialize each point from the input array - if (E1_read_bytes(&sigs[i], &sigs_bytes[G1_SER_BYTES * i], G1_SER_BYTES) != + if (E1_read_bytes(&vec[i], &in_bytes[G1_SER_BYTES * i], G1_SER_BYTES) != VALID) { error = INVALID; goto out; @@ -628,12 +635,12 @@ int E1_sum_vector_byte(byte *dest, const byte *sigs_bytes, const int sigs_len) { } // sum the points E1 acc; - E1_sum_vector(&acc, sigs, n); + E1_sum_vector(&acc, vec, n); // export the result - E1_write_bytes(dest, &acc); + E1_write_bytes(out, &acc); error = VALID; out: - free(sigs); + free(vec); mem_error: return error; } @@ -648,13 +655,13 @@ void G1_mult_gen(E1 *res, const Fr *expo) { // Reads a scalar bytes and maps it to Fp using modular reduction. // output is in Montgomery form. -// `len` must be less or equal to 96 bytes and must be a multiple of 8. +// `in_len` must be less or equal to 96 bytes and must be a multiple of 8. // This function is only used by `map_to_G1` where input is 64 bytes. -// input `len` is not checked to satisfy the conditions above. -static void map_96_bytes_to_Fp(Fp *a, const byte *bin, int len) { +// input `in_len` is not checked to satisfy the conditions above. 
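For context on the mapping below: map_to_G1 follows construction 2 of the paper referenced in its comment, i.e. the 128-byte input is split into two 64-byte halves, each half is reduced into a field element u_0 resp. u_1, and the output is clear_cofactor(map_to_curve(u_0) + map_to_curve(u_1)), where map_to_curve is the simplified SWU map evaluated through an isogenous curve.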
+static void map_96_bytes_to_Fp(Fp *a, const byte *in, int in_len) { vec768 tmp; vec_zero(&tmp, sizeof(tmp)); - limbs_from_be_bytes((limb_t *)tmp, bin, len); + limbs_from_be_bytes((limb_t *)tmp, in, in_len); redc_mont_384((limb_t *)a, tmp, BLS12_381_P, p0); // aR^(-2) Fp_mul_montg(a, a, (Fp *)BLS12_381_RRRR); // aR } @@ -662,16 +669,16 @@ static void map_96_bytes_to_Fp(Fp *a, const byte *bin, int len) { // maps bytes input `hash` to G1. // `hash` must be `MAP_TO_G1_INPUT_LEN` (128 bytes) // It uses construction 2 from section 5 in https://eprint.iacr.org/2019/403.pdf -int map_to_G1(E1 *h, const byte *hash, const int len) { +int map_to_G1(E1 *h, const byte *hash, const int hash_len) { // sanity check of length - if (len != MAP_TO_G1_INPUT_LEN) { + if (hash_len != MAP_TO_G1_INPUT_LEN) { return INVALID; } // map to field elements Fp u[2]; - map_96_bytes_to_Fp(&u[0], hash, MAP_TO_G1_INPUT_LEN / 2); - map_96_bytes_to_Fp(&u[1], hash + MAP_TO_G1_INPUT_LEN / 2, - MAP_TO_G1_INPUT_LEN / 2); + const int half = MAP_TO_G1_INPUT_LEN / 2; + map_96_bytes_to_Fp(&u[0], hash, half); + map_96_bytes_to_Fp(&u[1], hash + half, half); // map field elements to G1 // inputs must be in Montgomery form map_to_g1((POINTonE1 *)h, (limb_t *)&u[0], (limb_t *)&u[1]); @@ -692,11 +699,11 @@ void unsafe_map_bytes_to_G1(E1 *p, const byte *bytes, int len) { // maps bytes to a point in E1\G1. // `len` must be at least 96 bytes. -// this is a testing file only, should not be used in any protocol! -void unsafe_map_bytes_to_G1complement(E1 *p, const byte *bytes, int len) { - assert(len >= 96); +// this is a testing function only, should not be used in any protocol! +void unsafe_map_bytes_to_G1complement(E1 *p, const byte *in, int in_len) { + assert(in_len >= 96); Fp u; - map_96_bytes_to_Fp(&u, bytes, 96); + map_96_bytes_to_Fp(&u, in, 96); // map to E1's isogenous and then to E1 map_to_isogenous_E1((POINTonE1 *)p, u); isogeny_map_to_E1((POINTonE1 *)p, (POINTonE1 *)p); @@ -724,27 +731,27 @@ const E2 *BLS12_381_minus_g2 = (const E2 *)&BLS12_381_NEG_G2; // // Note: can use with POINTonE2_Deserialize_BE and POINTonE2_Uncompress_Z, // and update the logic around G2 subgroup check. 
-ERROR E2_read_bytes(E2 *a, const byte *bin, const int len) { +ERROR E2_read_bytes(E2 *a, const byte *in, const int in_len) { // check the length - if (len != G2_SER_BYTES) { + if (in_len != G2_SER_BYTES) { return BAD_ENCODING; } // check the compression bit - int compressed = bin[0] >> 7; + int compressed = in[0] >> 7; if ((compressed == 1) != (G2_SERIALIZATION == COMPRESSED)) { return BAD_ENCODING; } // check if the point in infinity - int is_infinity = bin[0] & 0x40; + int is_infinity = in[0] & 0x40; if (is_infinity) { // the remaining bits need to be cleared - if (bin[0] & 0x3F) { + if (in[0] & 0x3F) { return BAD_ENCODING; } for (int i = 1; i < G2_SER_BYTES - 1; i++) { - if (bin[i]) { + if (in[i]) { return BAD_ENCODING; } } @@ -753,14 +760,14 @@ ERROR E2_read_bytes(E2 *a, const byte *bin, const int len) { } // read the sign bit and check for consistency - int y_sign = (bin[0] >> 5) & 1; + int y_sign = (in[0] >> 5) & 1; if (y_sign && (!compressed)) { return BAD_ENCODING; } // use a temporary buffer to mask the header bits and read a.x byte temp[Fp2_BYTES]; - memcpy(temp, bin, Fp2_BYTES); + memcpy(temp, in, Fp2_BYTES); temp[0] &= 0x1F; // clear the header bits ERROR ret = Fp2_read_bytes(&a->x, temp, sizeof(temp)); if (ret != VALID) { @@ -777,7 +784,7 @@ ERROR E2_read_bytes(E2 *a, const byte *bin, const int len) { Fp2 *a_y = &(a->y); if (G2_SERIALIZATION == UNCOMPRESSED) { - ret = Fp2_read_bytes(a_y, bin + Fp2_BYTES, sizeof(a->y)); + ret = Fp2_read_bytes(a_y, in + Fp2_BYTES, sizeof(a->y)); if (ret != VALID) { return ret; } @@ -808,11 +815,11 @@ ERROR E2_read_bytes(E2 *a, const byte *bin, const int len) { // uncompressed form. It assumes buffer is of length G2_SER_BYTES The // serialization follows: // https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -void E2_write_bytes(byte *bin, const E2 *a) { +void E2_write_bytes(byte *out, const E2 *a) { if (E2_is_infty(a)) { // set the infinity bit - bin[0] = (G2_SERIALIZATION << 7) | (1 << 6); - memset(bin + 1, 0, G2_SER_BYTES - 1); + out[0] = (G2_SERIALIZATION << 7) | (1 << 6); + memset(out + 1, 0, G2_SER_BYTES - 1); return; } E2 tmp; @@ -821,18 +828,18 @@ void E2_write_bytes(byte *bin, const E2 *a) { Fp2 *t_x = &(tmp.x); Fp_from_montg(&real(t_x), &real(t_x)); Fp_from_montg(&imag(t_x), &imag(t_x)); - Fp2_write_bytes(bin, t_x); + Fp2_write_bytes(out, t_x); Fp2 *t_y = &(tmp.y); if (G2_SERIALIZATION == COMPRESSED) { - bin[0] |= (Fp2_get_sign(t_y) << 5); + out[0] |= (Fp2_get_sign(t_y) << 5); } else { Fp_from_montg(&real(t_y), &real(t_y)); Fp_from_montg(&imag(t_y), &imag(t_y)); - Fp2_write_bytes(bin + Fp2_BYTES, t_y); + Fp2_write_bytes(out + Fp2_BYTES, t_y); } - bin[0] |= (G2_SERIALIZATION << 7); + out[0] |= (G2_SERIALIZATION << 7); } // set p to infinity @@ -940,10 +947,10 @@ void G2_mult_gen(E2 *res, const Fr *expo) { vec_zero(&tmp, sizeof(tmp)); } -// Exponentiation of generator g2 of G2, res = expo.g2 +// Exponentiation of generator g2 of G2, res = expo.g2. // -// This is useful for results being used multiple times in pairings. -// Conversion to affine saves later pre-pairing conversions. +// Result is converted to affine. This is useful for results being used multiple +// times in pairings. Conversion to affine saves later pre-pairing conversions. 
void G2_mult_gen_to_affine(E2 *res, const Fr *expo) { G2_mult_gen(res, expo); E2_to_affine(res, res); @@ -957,9 +964,9 @@ bool E2_in_G2(const E2 *p) { } // computes the sum of the E2 array elements `y[i]` and writes it in `sum` -void E2_sum_vector(E2 *sum, const E2 *y, const int len) { +void E2_sum_vector(E2 *sum, const E2 *y, const int y_len) { E2_set_infty(sum); - for (int i = 0; i < len; i++) { + for (int i = 0; i < y_len; i++) { E2_add(sum, sum, &y[i]); } } @@ -967,41 +974,41 @@ void E2_sum_vector(E2 *sum, const E2 *y, const int len) { // computes the sum of the E2 array elements `y[i]`, converts it // to affine coordinates, and writes it in `sum`. // -// This is useful for results being used multiple times in pairings. -// Conversion to affine saves later pre-pairing conversions. -void E2_sum_vector_to_affine(E2 *sum, const E2 *y, const int len) { - E2_sum_vector(sum, y, len); +// Result is converted to affine. This is useful for results being used multiple +// times in pairings. Conversion to affine saves later pre-pairing conversions. +void E2_sum_vector_to_affine(E2 *sum, const E2 *y, const int y_len) { + E2_sum_vector(sum, y, y_len); E2_to_affine(sum, sum); } // Subtracts all G2 array elements `y` from an element `x` and writes the -// result in res -void E2_subtract_vector(E2 *res, const E2 *x, const E2 *y, const int len) { - E2_sum_vector(res, y, len); +// result in res. +void E2_subtract_vector(E2 *res, const E2 *x, const E2 *y, const int y_len) { + E2_sum_vector(res, y, y_len); E2_neg(res, res); E2_add(res, x, res); } // maps the bytes to a point in G2. -// `len` should be at least Fr_BYTES. +// `in_len` should be at least Fr_BYTES. // this is a testing tool only, it should not be used in any protocol! -void unsafe_map_bytes_to_G2(E2 *p, const byte *bytes, int len) { - assert(len >= Fr_BYTES); +void unsafe_map_bytes_to_G2(E2 *p, const byte *in, int in_len) { + assert(in_len >= Fr_BYTES); // map to Fr Fr log; - map_bytes_to_Fr(&log, bytes, len); + map_bytes_to_Fr(&log, in, in_len); // multiplies G2 generator by a random scalar G2_mult_gen(p, &log); } -// maps `bytes` to a point in E2\G2 and stores it in p. +// maps `in` to a point in E2\G2 and stores it in p. // `len` should be at least 192. // this is a testing tool only, it should not be used in any protocol! -void unsafe_map_bytes_to_G2complement(E2 *p, const byte *bytes, int len) { - assert(len >= 192); +void unsafe_map_bytes_to_G2complement(E2 *p, const byte *in, int in_len) { + assert(in_len >= 192); Fp2 u; - map_96_bytes_to_Fp(&real(&u), bytes, 96); - map_96_bytes_to_Fp(&imag(&u), bytes + 96, 96); + map_96_bytes_to_Fp(&real(&u), in, 96); + map_96_bytes_to_Fp(&imag(&u), in + 96, 96); // map to E2's isogenous and then to E2 map_to_isogenous_E2((POINTonE2 *)p, u); isogeny_map_to_E2((POINTonE2 *)p, (POINTonE2 *)p); @@ -1080,6 +1087,8 @@ void Fp12_multi_pairing(Fp12 *res, const E1 *p, const E2 *q, const int len) { final_exp(res_vec, res_vec); } +// ------------------- Other utilities + // This is a testing function and is not used in exported functions // It uses an expand message XMD based on SHA2-256. 
void xmd_sha256(byte *hash, int len_hash, byte *msg, int len_msg, byte *dst, diff --git a/crypto/blst_include.h b/crypto/blst_include.h index dc942b5976b..5a3c47f0260 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -6,16 +6,16 @@ #include "fields.h" #include "point.h" -// types used by the Flow crypto library that are imported from BLST -// these type definitions are used as an abstraction from BLST internal types +// types used by the Flow crypto library that are imported from BLST. +// these type definitions are used as an abstraction from BLST internal types. // field elements F_r // where `r` is the order of G1/G2. // F_r elements are represented as big numbers reduced modulo `r`. Big numbers // are represented as a little endian vector of limbs. // `Fr` is equivalent to type `vec256` (used internally by BLST for F_r -// elements). `Fr` is defined as a struct to be exportable through cgo to the Go -// layer. +// elements). `Fr` is defined as a struct so that it can be exportable through +// cgo to the Go layer. #define R_BITS 255 // equal to Fr_bits in bls12381_utils.h typedef struct { limb_t limbs[(R_BITS + 63) / 64]; @@ -30,7 +30,7 @@ typedef vec384 Fp; // curve E_1 (over F_p) // E_1 points are represented in Jacobian coordinates (x,y,z), -// where x, y, x are elements of F_p (type `Fp`). +// where x, y, z are elements of F_p (type `Fp`). // `E1` is equivalent to type `POINTonE1` (used internally by BLST for Jacobian // E1 elements) `E1` is defined as a struct to be exportable through cgo to the // Go layer. `E1` is also used to represent all subgroup G_1 elements. @@ -49,7 +49,7 @@ typedef vec384x Fp2; // curve E_2 (over F_p^2) // E_2 points are represented in Jacobian coordinates (x,y,z), -// where x, y, x are elements of F_p (type `Fp`). +// where x, y, z are elements of F_p (type `Fp`). // `E2` is equivelent to type `POINTonE2` (used internally by BLST for Jacobian // E2 elements) `E2` is defined as a struct to be exportable through cgo to the // Go layer. `E2` is also used to represent all subgroup G_2 elements. From 2332a61323bd698419834a5c1c8075b34d2f7ddb Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 20 Sep 2023 20:46:39 -0600 Subject: [PATCH 182/200] clean up threshold and dkg C files - use poly degree in secret sharing --- crypto/bls12381_utils.c | 1 - crypto/bls_core.c | 26 +++++++++++----------- crypto/bls_thresholdsign.go | 4 ++-- crypto/bls_thresholdsign_core.c | 38 ++++++++++++++++----------------- crypto/dkg_core.c | 26 ++++++++++++---------- 5 files changed, 49 insertions(+), 46 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 9f168e0b3e0..4b2d4ba0cc4 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -457,7 +457,6 @@ bool E1_affine_on_curve(const E1 *p) { // It assumes input `p` is on E1. bool E1_in_G1(const E1 *p) { // currently uses Scott method - // TODO: compare to clearing the cofactor using u-1 return POINTonE1_in_G1((const POINTonE1 *)p); } diff --git a/crypto/bls_core.c b/crypto/bls_core.c index aac7d60ee18..83c12480829 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -2,12 +2,10 @@ // this file is about the core functions required by the BLS signature scheme -// The functions are tested for ALLOC=AUTO (not for ALLOC=DYNAMIC) - -// Computes a BLS signature from a G1 point and writes it in `out`. +// Compute a BLS signature from a G1 point (not checked) and writes it in `out`. // `out` must be allocated properly with `G1_SER_BYTES` bytes. 
static void bls_sign_E1(byte *out, const Fr *sk, const E1 *h) { - // s = h^s + // s = h^sk E1 s; E1_mult(&s, h, sk); E1_write_bytes(out, &s); @@ -15,8 +13,8 @@ static void bls_sign_E1(byte *out, const Fr *sk, const E1 *h) { // Computes a BLS signature from a hash and writes it in `out`. // `hash` represents the hashed message with length `hash_len` equal to -// `MAP_TO_G1_INPUT_LEN`. `out` must be allocated properly with `G1_SER_BYTES` -// bytes. +// `MAP_TO_G1_INPUT_LEN`. +// `out` must be allocated properly with `G1_SER_BYTES` bytes. int bls_sign(byte *out, const Fr *sk, const byte *hash, const int hash_len) { // hash to G1 E1 h; @@ -33,7 +31,8 @@ extern const E2 *BLS12_381_minus_g2; // Verifies a BLS signature (G1 point) against a public key (G2 point) // and a message hash `h` (G1 point). // Hash, signature and public key are assumed to be in G1, G1 and G2 -// respectively. This function only checks the pairing equality. +// respectively. +// This function only checks the pairing equality. static int bls_verify_E1(const E2 *pk, const E1 *s, const E1 *h) { E1 elemsG1[2]; E2 elemsG2[2]; @@ -70,8 +69,9 @@ static int bls_verify_E1(const E2 *pk, const E1 *s, const E1 *h) { // membership check of pks in G2 is not verified in this function // the membership check is separated to allow optimizing multiple verifications // using the same pks -int bls_verifyPerDistinctMessage(const byte *sig, const int nb_hashes, - const byte *hashes, const uint32_t *len_hashes, +int bls_verifyPerDistinctMessage(const byte *sig, + const int nb_hashes, const byte *hashes, + const uint32_t *len_hashes, const uint32_t *pks_per_hash, const E2 *pks) { int ret = UNDEFINED; // return value @@ -148,8 +148,8 @@ int bls_verifyPerDistinctMessage(const byte *sig, const int nb_hashes, // the membership check is separated to allow optimizing multiple verifications // using the same pks int bls_verifyPerDistinctKey(const byte *sig, const int nb_pks, const E2 *pks, - const uint32_t *hashes_per_pk, const byte *hashes, - const uint32_t *len_hashes) { + const uint32_t *hashes_per_pk, + const byte *hashes, const uint32_t *len_hashes) { int ret = UNDEFINED; // return value @@ -464,8 +464,8 @@ void bls_batch_verify(const int sigs_len, byte *results, const E2 *pks_input, // Membership check in G2 of both keys is not verified in this function. // the membership check in G2 is separated to allow optimizing multiple // verifications using the same public keys. 
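As a reminder of the relations checked in this file: a signature is σ = sk·H(m) (bls_sign_E1 above), and plain verification checks the pairing equality e(σ, g2) = e(H(m), pk) with pk = sk·g2. The SPoCK check below relies on a similar identity: for two honestly formed signatures σ1, σ2 of the same (possibly secret) message under keys pk1, pk2, bilinearity gives e(σ1, pk2) = e(H(m), g2)^(sk1·sk2) = e(σ2, pk1), which is the equality bls_spock_verify is built around.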
-int bls_spock_verify(const E2 *pk1, const byte *sig1, const E2 *pk2, - const byte *sig2) { +int bls_spock_verify(const E2 *pk1, const byte *sig1, + const E2 *pk2, const byte *sig2) { E1 elemsG1[2]; E2 elemsG2[2]; diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 83fb6d6949f..c6ad1facd97 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -416,7 +416,7 @@ func (s *blsThresholdSignatureInspector) reconstructThresholdSignature() (Signat result := C.E1_lagrange_interpolate_at_zero_write( (*C.uchar)(&thresholdSignature[0]), (*C.uchar)(&shares[0]), - (*C.uint8_t)(&signers[0]), (C.int)(s.threshold+1)) + (*C.uint8_t)(&signers[0]), (C.int)(s.threshold)) if result != valid { return nil, invalidSignatureError @@ -508,7 +508,7 @@ func BLSReconstructThresholdSignature(size int, threshold int, if C.E1_lagrange_interpolate_at_zero_write( (*C.uchar)(&thresholdSignature[0]), (*C.uchar)(&flatShares[0]), - (*C.uint8_t)(&indexSigners[0]), (C.int)(threshold+1), + (*C.uint8_t)(&indexSigners[0]), (C.int)(threshold), ) != valid { return nil, invalidSignatureError } diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index dc7e1354907..7bbe526121a 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -6,9 +6,10 @@ // Computes the Lagrange coefficient L_i(0) in Fr with regards to the range // [indices(0)..indices(t)] and stores it in `res`, where t is the degree of the -// polynomial P. `len` is equal to `t+1` where `t` is the polynomial degree. +// polynomial P. +// `degree` is equal to the polynomial degree `t`. static void Fr_lagrange_coeff_at_zero(Fr *res, const int i, - const byte indices[], const int len) { + const byte indices[], const int degree) { // coefficient is computed as N * D^(-1) Fr numerator; // eventually would represent N*R^k @@ -24,15 +25,14 @@ static void Fr_lagrange_coeff_at_zero(Fr *res, const int i, // the highest k such that fact(MAX_IND)/fact(MAX_IND-k) < 2^64 (approximately // 64/MAX_IND_BITS) this means we can multiply up to (k) indices in a limb (64 // bits) without overflowing. -#define MAX_IND_LOOPS (64 / MAX_IND_BITS) - const int loops = MAX_IND_LOOPS; + const int loops = 64 / MAX_IND_BITS; int k, j = 0; Fr tmp; - while (j < len) { + while (j < degree+1) { limb_t limb_numerator = 1; limb_t limb_denominator = 1; - for (k = j; j < MIN(len, k + loops); - j++) { // batch up to `loops` elements in one limb + // batch up to `loops` elements in one limb + for (k = j; j < MIN(degree+1, k + loops); j++) { if (j == i) continue; if (indices[j] < indices[i]) { @@ -65,11 +65,11 @@ static void Fr_lagrange_coeff_at_zero(Fr *res, const int i, // Computes the Langrange interpolation at zero P(0) = LI(0) with regards to the // indices [indices(0)..indices(t)] and their G1 images [shares(0)..shares(t)], -// and stores the resulting G1 point in `dest`. `len` is equal to `t+1` where -// `t` is the polynomial degree. +// and stores the resulting G1 point in `dest`. +// `degree` is equal to the polynomial degree `t`. static void E1_lagrange_interpolate_at_zero(E1 *out, const E1 shares[], const byte indices[], - const int len) { + const int degree) { // Purpose is to compute Q(0) where Q(x) = A_0 + A_1*x + ... 
+ A_t*x^t in G1 // where A_i = g1 ^ a_i @@ -79,22 +79,22 @@ static void E1_lagrange_interpolate_at_zero(E1 *out, const E1 shares[], E1_set_infty(out); Fr fr_lagr_coef; E1 mult; - for (int i = 0; i < len; i++) { - Fr_lagrange_coeff_at_zero(&fr_lagr_coef, i, indices, len); + for (int i = 0; i < degree+1; i++) { + Fr_lagrange_coeff_at_zero(&fr_lagr_coef, i, indices, degree); E1_mult(&mult, &shares[i], &fr_lagr_coef); E1_add(out, out, &mult); } } -// Computes the Langrange interpolation at zero LI(0) with regards to the +// Computes the Lagrange interpolation at zero LI(0) with regards to the // indices [indices(0)..indices(t)] and writes their E1 concatenated -// serializations [shares(1)..shares(t+1)] in `dest`. `len` is equal to `t+1` -// where `t` is the polynomial degree. +// serializations [shares(1)..shares(t+1)] in `dest`. +// `degree` is equal to the polynomial degree `t`. int E1_lagrange_interpolate_at_zero_write(byte *dest, const byte *shares, - const byte indices[], const int len) { + const byte indices[], const int degree) { int read_ret; - E1 *E1_shares = malloc(sizeof(E1) * len); - for (int i = 0; i < len; i++) { + E1 *E1_shares = malloc(sizeof(E1) * (degree+1)); + for (int i = 0; i < degree+1; i++) { read_ret = E1_read_bytes(&E1_shares[i], &shares[G1_SER_BYTES * i], G1_SER_BYTES); if (read_ret != VALID) { @@ -106,7 +106,7 @@ int E1_lagrange_interpolate_at_zero_write(byte *dest, const byte *shares, // computes Q(x) = A_0 + A_1*x + ... + A_t*x^t in G1, // where A_i = g1 ^ a_i E1 res; - E1_lagrange_interpolate_at_zero(&res, E1_shares, indices, len); + E1_lagrange_interpolate_at_zero(&res, E1_shares, indices, degree); // export the result E1_write_bytes(dest, &res); read_ret = VALID; diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index f5f48db67ae..3dab93b9fc7 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -13,8 +13,8 @@ void Fr_polynomial_image_write(byte *out, E2 *y, const Fr *a, const int degree, // computes P(x) = a_0 + a_1 * x + .. + a_n * x^n where P is in Fr[X]. // a_i are all in Fr, `degree` is P's degree, x is a small integer less than -// 255. The function writes P(x) in `image` and P(x).g2 in `y` if `y` is non -// NULL +// `MAX_IND` (currently 255). +// The function writes P(x) in `image` and P(x).g2 in `y` if `y` is non NULL. void Fr_polynomial_image(Fr *image, E2 *y, const Fr *a, const int degree, const byte x) { Fr_set_zero(image); @@ -34,7 +34,9 @@ void Fr_polynomial_image(Fr *image, E2 *y, const Fr *a, const int degree, } // computes Q(x) = A_0 + A_1*x + ... + A_n*x^n in G2 -// and stores the point in y +// and stores the point in y. +// - A_i being G2 points +// - x being a small scalar (less than `MAX_IND`) static void E2_polynomial_image(E2 *y, const E2 *A, const int degree, const byte x) { E2_set_infty(y); @@ -45,7 +47,9 @@ static void E2_polynomial_image(E2 *y, const E2 *A, const int degree, } // computes y[i] = Q(i+1) for all participants i ( 0 <= i < len_y) -// where Q(x) = A_0 + A_1*x + ... + A_n*x^n in G2[X] +// where Q(x) = A_0 + A_1*x + ... + A_n*x^n +// - A_i being G2 points +// - x being a small scalar (less than `MAX_IND`) void E2_polynomial_images(E2 *y, const int len_y, const E2 *A, const int degree) { for (byte i = 0; i < len_y; i++) { @@ -56,17 +60,17 @@ void E2_polynomial_images(E2 *y, const int len_y, const E2 *A, // export an array of G2 into an array of bytes by concatenating // all serializations of G2 points in order. -// the array must be of length (len * G2_SER_BYTES). 
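Note: threshold reconstruction uses Lagrange interpolation at zero, L_i(0) = prod_{j != i} x_j / (x_j - x_i) over Fr, so t+1 valid shares of a degree-t polynomial recover P(0); this is why the functions above now take the degree t instead of t+1. A toy big-integer sketch of the same interpolation (illustrative shares, not the limb-based C arithmetic):

package main

import (
	"fmt"
	"math/big"
)

func main() {
	// BLS12-381 subgroup order r
	r, _ := new(big.Int).SetString(
		"73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001", 16)

	// toy degree-1 polynomial P(x) = 9 + 2x, so P(0) = 9 and t = 1
	xs := []*big.Int{big.NewInt(3), big.NewInt(5)}   // t+1 = 2 distinct indices
	ys := []*big.Int{big.NewInt(15), big.NewInt(19)} // shares P(3), P(5)

	p0 := big.NewInt(0)
	for i := range xs {
		// L_i(0) = prod_{j != i} x_j / (x_j - x_i)  (mod r)
		num, den := big.NewInt(1), big.NewInt(1)
		for j := range xs {
			if j == i {
				continue
			}
			num.Mul(num, xs[j])
			den.Mul(den, new(big.Int).Sub(xs[j], xs[i]))
		}
		coeff := new(big.Int).Mul(num, new(big.Int).ModInverse(den.Mod(den, r), r))
		p0.Add(p0, new(big.Int).Mul(coeff, ys[i]))
		p0.Mod(p0, r)
	}
	fmt.Println("reconstructed P(0) =", p0) // 9
}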
-void G2_vector_write_bytes(byte *out, const E2 *A, const int len) { +// the array must be of length (A_len * G2_SER_BYTES). +void G2_vector_write_bytes(byte *out, const E2 *A, const int A_len) { byte *p = out; - for (int i = 0; i < len; i++) { + for (int i = 0; i < A_len; i++) { E2_write_bytes(p, &A[i]); p += G2_SER_BYTES; } } -// The function imports an array of `n` E2 points from a concatenated array of -// bytes. The bytes array is supposed to be of size (n * G2_SER_BYTES). +// The function imports an array of `A_len` E2 points from a concatenated array of +// bytes. The bytes array is supposed to be of size (A_len * G2_SER_BYTES). // // If return is `VALID`, output vector is guaranteed to be in G2. // It returns other errors if at least one input isn't a serialization of a E2 @@ -80,9 +84,9 @@ void G2_vector_write_bytes(byte *out, const E2 *A, const int len) { // E2. // - POINT_NOT_IN_GROUP if at least one E2 point isn't in G2. // - VALID if deserialization of all points to G2 is valid. -ERROR G2_vector_read_bytes(E2 *A, const byte *src, const int n) { +ERROR G2_vector_read_bytes(E2 *A, const byte *src, const int A_len) { byte *p = (byte *)src; - for (int i = 0; i < n; i++) { + for (int i = 0; i < A_len; i++) { int read_ret = E2_read_bytes(&A[i], p, G2_SER_BYTES); if (read_ret != VALID) { return read_ret; From 2cd3d283705e2628b89a4d6920ad5f010690c260 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 20 Sep 2023 21:03:03 -0600 Subject: [PATCH 183/200] format c files --- crypto/bls12381_utils.c | 18 +++++++++--------- crypto/bls12381_utils.h | 5 ++--- crypto/bls_core.c | 17 ++++++++--------- crypto/bls_thresholdsign_core.c | 25 +++++++++++++------------ crypto/blst_include.h | 2 +- crypto/dkg_core.c | 14 +++++++------- crypto/dkg_include.h | 2 +- 7 files changed, 41 insertions(+), 42 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 4b2d4ba0cc4..528f865cfdd 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1,6 +1,6 @@ // this file contains utility functions for the curve BLS 12-381 // these tools are shared by the BLS signature scheme, the BLS based threshold -// signature and the BLS distributed key generation protocols +// signature, BLS-SPoCK and the BLS distributed key generation protocols #include "bls12381_utils.h" #include "assert.h" @@ -18,7 +18,7 @@ void types_sanity(void) { assert(sizeof(E2) == sizeof(POINTonE2)); assert(sizeof(vec384fp12) == sizeof(Fp12)); } - + // ------------------- Fr utilities // Montgomery constant R related to the curve order r @@ -78,8 +78,8 @@ void Fr_squ_montg(Fr *res, const Fr *a) { // res = a*R void Fr_to_montg(Fr *res, const Fr *a) { - mul_mont_sparse_256((limb_t *)res, (limb_t *)a, - BLS12_381_rRR, BLS12_381_r, r0); + mul_mont_sparse_256((limb_t *)res, (limb_t *)a, BLS12_381_rRR, BLS12_381_r, + r0); } // res = a*R^(-1) @@ -273,8 +273,8 @@ static bool Fp_check(const Fp *a) { Fp temp; Fp_add(&temp, a, &ZERO_384); return vec_is_equal(&temp, a, Fp_BYTES); - // no need to clear `tmp` as no current use-case involves sensitive data being passed - // as `a` + // no need to clear `tmp` as no current use-case involves sensitive data being + // passed as `a` } // res = a*b*R^(-1) @@ -606,7 +606,7 @@ void E1_sum_vector(E1 *sum, const E1 *y, const int len) { // Computes the sum of input E1 elements flattened in a single byte // array `in_bytes` of `in_len` bytes. and writes the sum (E1 element) as -// bytes in `out`. +// bytes in `out`. 
// The function does not check membership of E1 inputs in G1 // subgroup. The header is using byte pointers to minimize Cgo calls from the Go // layer. @@ -948,7 +948,7 @@ void G2_mult_gen(E2 *res, const Fr *expo) { // Exponentiation of generator g2 of G2, res = expo.g2. // -// Result is converted to affine. This is useful for results being used multiple +// Result is converted to affine. This is useful for results being used multiple // times in pairings. Conversion to affine saves later pre-pairing conversions. void G2_mult_gen_to_affine(E2 *res, const Fr *expo) { G2_mult_gen(res, expo); @@ -973,7 +973,7 @@ void E2_sum_vector(E2 *sum, const E2 *y, const int y_len) { // computes the sum of the E2 array elements `y[i]`, converts it // to affine coordinates, and writes it in `sum`. // -// Result is converted to affine. This is useful for results being used multiple +// Result is converted to affine. This is useful for results being used multiple // times in pairings. Conversion to affine saves later pre-pairing conversions. void E2_sum_vector_to_affine(E2 *sum, const E2 *y, const int y_len) { E2_sum_vector(sum, y, y_len); diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index b0f96669ed7..923208ef3f3 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -1,6 +1,6 @@ // this file contains utility functions for the curve BLS 12-381 // these tools are shared by the BLS signature scheme, the BLS based threshold -// signature and the BLS distributed key generation protocols +// signature, BLS-SPoCK and the BLS distributed key generation protocols #ifndef _BLS12_381_UTILS_H #define _BLS12_381_UTILS_H @@ -101,8 +101,7 @@ void unsafe_map_bytes_to_G1(E1 *, const byte *, int); void unsafe_map_bytes_to_G1complement(E1 *, const byte *, int); #define MAP_TO_G1_INPUT_LEN (2 * (Fp_BYTES + SEC_BITS / 8)) -int map_to_G1(E1 *, const byte *, - const int); // functions in bls12381_hashtocurve.c +int map_to_G1(E1 *, const byte *, const int); // E2 and G2 utilities void E2_set_infty(E2 *p); diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 83c12480829..19d29f46713 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -13,7 +13,7 @@ static void bls_sign_E1(byte *out, const Fr *sk, const E1 *h) { // Computes a BLS signature from a hash and writes it in `out`. // `hash` represents the hashed message with length `hash_len` equal to -// `MAP_TO_G1_INPUT_LEN`. +// `MAP_TO_G1_INPUT_LEN`. // `out` must be allocated properly with `G1_SER_BYTES` bytes. int bls_sign(byte *out, const Fr *sk, const byte *hash, const int hash_len) { // hash to G1 @@ -31,7 +31,7 @@ extern const E2 *BLS12_381_minus_g2; // Verifies a BLS signature (G1 point) against a public key (G2 point) // and a message hash `h` (G1 point). // Hash, signature and public key are assumed to be in G1, G1 and G2 -// respectively. +// respectively. // This function only checks the pairing equality. 
static int bls_verify_E1(const E2 *pk, const E1 *s, const E1 *h) { E1 elemsG1[2]; @@ -69,9 +69,8 @@ static int bls_verify_E1(const E2 *pk, const E1 *s, const E1 *h) { // membership check of pks in G2 is not verified in this function // the membership check is separated to allow optimizing multiple verifications // using the same pks -int bls_verifyPerDistinctMessage(const byte *sig, - const int nb_hashes, const byte *hashes, - const uint32_t *len_hashes, +int bls_verifyPerDistinctMessage(const byte *sig, const int nb_hashes, + const byte *hashes, const uint32_t *len_hashes, const uint32_t *pks_per_hash, const E2 *pks) { int ret = UNDEFINED; // return value @@ -148,8 +147,8 @@ int bls_verifyPerDistinctMessage(const byte *sig, // the membership check is separated to allow optimizing multiple verifications // using the same pks int bls_verifyPerDistinctKey(const byte *sig, const int nb_pks, const E2 *pks, - const uint32_t *hashes_per_pk, - const byte *hashes, const uint32_t *len_hashes) { + const uint32_t *hashes_per_pk, const byte *hashes, + const uint32_t *len_hashes) { int ret = UNDEFINED; // return value @@ -464,8 +463,8 @@ void bls_batch_verify(const int sigs_len, byte *results, const E2 *pks_input, // Membership check in G2 of both keys is not verified in this function. // the membership check in G2 is separated to allow optimizing multiple // verifications using the same public keys. -int bls_spock_verify(const E2 *pk1, const byte *sig1, - const E2 *pk2, const byte *sig2) { +int bls_spock_verify(const E2 *pk1, const byte *sig1, const E2 *pk2, + const byte *sig2) { E1 elemsG1[2]; E2 elemsG2[2]; diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index 7bbe526121a..7c1d809d228 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -6,7 +6,7 @@ // Computes the Lagrange coefficient L_i(0) in Fr with regards to the range // [indices(0)..indices(t)] and stores it in `res`, where t is the degree of the -// polynomial P. +// polynomial P. // `degree` is equal to the polynomial degree `t`. static void Fr_lagrange_coeff_at_zero(Fr *res, const int i, const byte indices[], const int degree) { @@ -22,17 +22,17 @@ static void Fr_lagrange_coeff_at_zero(Fr *res, const int i, // sign of D: 0 for positive and 1 for negative int sign = 0; -// the highest k such that fact(MAX_IND)/fact(MAX_IND-k) < 2^64 (approximately -// 64/MAX_IND_BITS) this means we can multiply up to (k) indices in a limb (64 -// bits) without overflowing. + // the highest k such that fact(MAX_IND)/fact(MAX_IND-k) < 2^64 (approximately + // 64/MAX_IND_BITS) this means we can multiply up to (k) indices in a limb (64 + // bits) without overflowing. const int loops = 64 / MAX_IND_BITS; int k, j = 0; Fr tmp; - while (j < degree+1) { + while (j < degree + 1) { limb_t limb_numerator = 1; limb_t limb_denominator = 1; // batch up to `loops` elements in one limb - for (k = j; j < MIN(degree+1, k + loops); j++) { + for (k = j; j < MIN(degree + 1, k + loops); j++) { if (j == i) continue; if (indices[j] < indices[i]) { @@ -65,7 +65,7 @@ static void Fr_lagrange_coeff_at_zero(Fr *res, const int i, // Computes the Langrange interpolation at zero P(0) = LI(0) with regards to the // indices [indices(0)..indices(t)] and their G1 images [shares(0)..shares(t)], -// and stores the resulting G1 point in `dest`. +// and stores the resulting G1 point in `dest`. // `degree` is equal to the polynomial degree `t`. 
static void E1_lagrange_interpolate_at_zero(E1 *out, const E1 shares[], const byte indices[], @@ -79,7 +79,7 @@ static void E1_lagrange_interpolate_at_zero(E1 *out, const E1 shares[], E1_set_infty(out); Fr fr_lagr_coef; E1 mult; - for (int i = 0; i < degree+1; i++) { + for (int i = 0; i < degree + 1; i++) { Fr_lagrange_coeff_at_zero(&fr_lagr_coef, i, indices, degree); E1_mult(&mult, &shares[i], &fr_lagr_coef); E1_add(out, out, &mult); @@ -88,13 +88,14 @@ static void E1_lagrange_interpolate_at_zero(E1 *out, const E1 shares[], // Computes the Lagrange interpolation at zero LI(0) with regards to the // indices [indices(0)..indices(t)] and writes their E1 concatenated -// serializations [shares(1)..shares(t+1)] in `dest`. +// serializations [shares(1)..shares(t+1)] in `dest`. // `degree` is equal to the polynomial degree `t`. int E1_lagrange_interpolate_at_zero_write(byte *dest, const byte *shares, - const byte indices[], const int degree) { + const byte indices[], + const int degree) { int read_ret; - E1 *E1_shares = malloc(sizeof(E1) * (degree+1)); - for (int i = 0; i < degree+1; i++) { + E1 *E1_shares = malloc(sizeof(E1) * (degree + 1)); + for (int i = 0; i < degree + 1; i++) { read_ret = E1_read_bytes(&E1_shares[i], &shares[G1_SER_BYTES * i], G1_SER_BYTES); if (read_ret != VALID) { diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 5a3c47f0260..d5eb5079cfd 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -14,7 +14,7 @@ // F_r elements are represented as big numbers reduced modulo `r`. Big numbers // are represented as a little endian vector of limbs. // `Fr` is equivalent to type `vec256` (used internally by BLST for F_r -// elements). `Fr` is defined as a struct so that it can be exportable through +// elements). `Fr` is defined as a struct so that it can be exportable through // cgo to the Go layer. #define R_BITS 255 // equal to Fr_bits in bls12381_utils.h typedef struct { diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 3dab93b9fc7..c8fee6917f6 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -13,7 +13,7 @@ void Fr_polynomial_image_write(byte *out, E2 *y, const Fr *a, const int degree, // computes P(x) = a_0 + a_1 * x + .. + a_n * x^n where P is in Fr[X]. // a_i are all in Fr, `degree` is P's degree, x is a small integer less than -// `MAX_IND` (currently 255). +// `MAX_IND` (currently 255). // The function writes P(x) in `image` and P(x).g2 in `y` if `y` is non NULL. void Fr_polynomial_image(Fr *image, E2 *y, const Fr *a, const int degree, const byte x) { @@ -47,7 +47,7 @@ static void E2_polynomial_image(E2 *y, const E2 *A, const int degree, } // computes y[i] = Q(i+1) for all participants i ( 0 <= i < len_y) -// where Q(x) = A_0 + A_1*x + ... + A_n*x^n +// where Q(x) = A_0 + A_1*x + ... + A_n*x^n // - A_i being G2 points // - x being a small scalar (less than `MAX_IND`) void E2_polynomial_images(E2 *y, const int len_y, const E2 *A, @@ -58,10 +58,10 @@ void E2_polynomial_images(E2 *y, const int len_y, const E2 *A, } } -// export an array of G2 into an array of bytes by concatenating -// all serializations of G2 points in order. +// export an array of E2 into an array of bytes by concatenating +// all serializations of E2 points in order. // the array must be of length (A_len * G2_SER_BYTES). 
-void G2_vector_write_bytes(byte *out, const E2 *A, const int A_len) { +void E2_vector_write_bytes(byte *out, const E2 *A, const int A_len) { byte *p = out; for (int i = 0; i < A_len; i++) { E2_write_bytes(p, &A[i]); @@ -69,8 +69,8 @@ void G2_vector_write_bytes(byte *out, const E2 *A, const int A_len) { } } -// The function imports an array of `A_len` E2 points from a concatenated array of -// bytes. The bytes array is supposed to be of size (A_len * G2_SER_BYTES). +// The function imports an array of `A_len` E2 points from a concatenated array +// of bytes. The bytes array is supposed to be of size (A_len * G2_SER_BYTES). // // If return is `VALID`, output vector is guaranteed to be in G2. // It returns other errors if at least one input isn't a serialization of a E2 diff --git a/crypto/dkg_include.h b/crypto/dkg_include.h index 05d46187749..02fb9a363f4 100644 --- a/crypto/dkg_include.h +++ b/crypto/dkg_include.h @@ -8,7 +8,7 @@ void Fr_polynomial_image_write(byte *out, E2 *y, const Fr *a, const int deg, void Fr_polynomial_image(Fr *out, E2 *y, const Fr *a, const int deg, const byte x); void E2_polynomial_images(E2 *y, const int len_y, const E2 *A, const int deg); -void G2_vector_write_bytes(byte *out, const E2 *A, const int len); +void E2_vector_write_bytes(byte *out, const E2 *A, const int len); ERROR G2_vector_read_bytes(E2 *A, const byte *src, const int len); bool G2_check_log(const Fr *x, const E2 *y); From 262c3e0f332153b6e15f113f7cf9ff05fcc5a50d Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 20 Sep 2023 21:03:22 -0600 Subject: [PATCH 184/200] rename G2_ prefix to E2_ --- crypto/bls.go | 7 ++++--- crypto/bls12381_utils.go | 8 ++++---- crypto/dkg_feldmanvss.go | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index c6f01a6ab28..c33a90fdce6 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -37,11 +37,12 @@ import ( ) const ( - // SignatureLenBLSBLS12381 is the size of a `G_1` element. + // SignatureLenBLSBLS12381 is the serialization size of a `G_1` element. SignatureLenBLSBLS12381 = g1BytesLen - // PubKeyLenBLSBLS12381 is the size of a `G_2` element. + // PubKeyLenBLSBLS12381 is the serialization size of a `G_2` element. PubKeyLenBLSBLS12381 = g2BytesLen - // PrKeyLenBLSBLS12381 is the size of a `F_r` element, where `r` is the order of `G_1` and `G_2`. + // PrKeyLenBLSBLS12381 is the serialization size of a `F_r` element, + // where `r` is the order of `G_1` and `G_2`. 
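For BLS12-381 with the compressed Zcash-style encodings used throughout this patch set, the three serialization sizes named above are expected to be 48, 96 and 32 bytes. The concrete numbers below are the usual values and are an assumption here, since the diff only defines the constants in terms of g1BytesLen, g2BytesLen and frBytesLen:

package main

import "fmt"

// Assumed concrete values of the compressed BLS12-381 encodings.
const (
	g1BytesLen = 48 // one Fp coordinate, ceil(381/8) bytes (signature, G_1)
	g2BytesLen = 96 // two Fp coordinates (public key, G_2)
	frBytesLen = 32 // 255-bit scalar padded to 32 bytes (private key, F_r)
)

func main() {
	fmt.Println("sig:", g1BytesLen, "pk:", g2BytesLen, "sk:", frBytesLen)
}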
PrKeyLenBLSBLS12381 = frBytesLen // Hash to curve params diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index a3867b31b20..adfde987cfe 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -61,8 +61,8 @@ const ( ) // header of the point at infinity serializations -var g1SerHeader byte // g1 -var g2SerHeader byte // g2 +var g1SerHeader byte // g1 (G1 identity) +var g2SerHeader byte // g2 (G2 identity) // `g1“ serialization var g1Serialization []byte @@ -214,7 +214,7 @@ func readScalarFrStar(a *scalar, src []byte) error { return invalidInputsErrorf("input length must be %d, got %d", frBytesLen, len(src)) case badValue: - return invalidInputsErrorf("scalar is not in the correct range w.r.t the BLS12-381 curve") + return invalidInputsErrorf("scalar is not in the correct range") default: return invalidInputsErrorf("reading the scalar failed") } @@ -233,7 +233,7 @@ func readPointE2(a *pointE2, src []byte) error { case valid: return nil case badEncoding, badValue: - return invalidInputsErrorf("input could not deserialize to a E2 point") + return invalidInputsErrorf("input could not deserialize to an E2 point") case pointNotOnCurve: return invalidInputsErrorf("input is not a point on curve E2") default: diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index 2814e59ee14..3ce7f609f95 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -450,7 +450,7 @@ func frPolynomialImage(dest []byte, a []scalar, x index, y *pointE2) { // writeVerifVector exports a vector A into an array of bytes // assuming the array length matches the vector length func writeVerifVector(dest []byte, A []pointE2) { - C.G2_vector_write_bytes((*C.uchar)(&dest[0]), + C.E2_vector_write_bytes((*C.uchar)(&dest[0]), (*C.E2)(&A[0]), (C.int)(len(A)), ) From dc28e03c607b3afb23c8c8ff56c249211eb74c08 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 10 Oct 2023 14:15:02 -0500 Subject: [PATCH 185/200] move godoc closer to the type definition --- crypto/bls.go | 11 ++++------- crypto/bls12381_utils.go | 4 +++- crypto/bls_thresholdsign.go | 2 +- crypto/dkg_feldmanvss.go | 2 +- crypto/ecdsa.go | 10 ++++------ 5 files changed, 13 insertions(+), 16 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index c33a90fdce6..27ddd881bfd 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -301,7 +301,6 @@ func (a *blsBLS12381Algo) decodePrivateKey(privateKeyBytes []byte) (PrivateKey, sk := newPrKeyBLSBLS12381(nil) err := readScalarFrStar(&sk.scalar, privateKeyBytes) - if err != nil { return nil, fmt.Errorf("failed to read the private key: %w", err) } @@ -347,9 +346,6 @@ func (a *blsBLS12381Algo) decodePublicKeyCompressed(publicKeyBytes []byte) (Publ } // prKeyBLSBLS12381 is the private key of BLS using BLS12_381, it implements PrivateKey - -var _ PrivateKey = (*prKeyBLSBLS12381)(nil) - type prKeyBLSBLS12381 struct { // public key pk *pubKeyBLSBLS12381 @@ -357,6 +353,8 @@ type prKeyBLSBLS12381 struct { scalar scalar } +var _ PrivateKey = (*prKeyBLSBLS12381)(nil) + // newPrKeyBLSBLS12381 creates a new BLS private key with the given scalar. // If no scalar is provided, the function allocates an // empty scalar. @@ -427,9 +425,6 @@ func (sk *prKeyBLSBLS12381) String() string { // pubKeyBLSBLS12381 is the public key of BLS using BLS12_381, // it implements PublicKey. - -var _ PublicKey = (*pubKeyBLSBLS12381)(nil) - type pubKeyBLSBLS12381 struct { // The package guarantees an instance is only created with a point // on the correct G2 subgroup. 
No membership check is needed when the @@ -446,6 +441,8 @@ type pubKeyBLSBLS12381 struct { isIdentity bool } +var _ PublicKey = (*pubKeyBLSBLS12381)(nil) + // newPubKeyBLSBLS12381 creates a new BLS public key with the given point. // If no scalar is provided, the function allocates an // empty scalar. diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index adfde987cfe..41937bc18c2 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -64,7 +64,7 @@ const ( var g1SerHeader byte // g1 (G1 identity) var g2SerHeader byte // g2 (G2 identity) -// `g1“ serialization +// `g1` serialization var g1Serialization []byte var g2PublicKey pubKeyBLSBLS12381 @@ -89,12 +89,14 @@ func initBLS12381() { g2PublicKey.isIdentity = true } +// String returns a hex-encoded representation of the scalar. func (a *scalar) String() string { encoding := make([]byte, frBytesLen) writeScalar(encoding, a) return fmt.Sprintf("%#x", encoding) } +// String returns a hex-encoded representation of the E2 point. func (p *pointE2) String() string { encoding := make([]byte, g2BytesLen) writePointE2(encoding, p) diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index c6ad1facd97..efe660570db 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -536,7 +536,7 @@ func EnoughShares(threshold int, sharesNumber int) (bool, error) { // threshold signature scheme with a trusted dealer. // // The function returns : -// - (nil, nil, nil, invalidInputsErrorf) if: +// - (nil, nil, nil, invalidInputsErrorf) if: // - seed is too short // - n is not in [`ThresholdSignMinSize`, `ThresholdSignMaxSize`] // - threshold value is not in interval [1, n-1] diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index 3ce7f609f95..dbe7771b6c4 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -395,7 +395,7 @@ func (s *feldmanVSSstate) receiveShare(origin index, data []byte) { } } -// receives the public vector from the +// receives the public vector from the dealer func (s *feldmanVSSstate) receiveVerifVector(origin index, data []byte) { // only accept the verification vector from the dealer. 
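The `var _ PrivateKey = (*prKeyBLSBLS12381)(nil)` lines that this patch moves next to the type definitions are compile-time assertions, not runtime values. A small self-contained illustration of the idiom (all names hypothetical):

package main

// Signer is a hypothetical stand-in for interfaces such as PrivateKey or
// ThresholdSignatureParticipant.
type Signer interface {
	Sign(msg []byte) []byte
}

type dummySigner struct{}

func (d *dummySigner) Sign(msg []byte) []byte { return append([]byte(nil), msg...) }

// The build breaks on this line if *dummySigner ever stops satisfying Signer;
// keeping it next to the type definition makes the contract visible where the
// methods live, which is what the patch does for the BLS and ECDSA key types.
var _ Signer = (*dummySigner)(nil)

func main() {}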
if origin != s.dealerIndex { diff --git a/crypto/ecdsa.go b/crypto/ecdsa.go index 67d97e9a854..b09d3d5922f 100644 --- a/crypto/ecdsa.go +++ b/crypto/ecdsa.go @@ -322,9 +322,6 @@ func (a *ecdsaAlgo) decodePublicKeyCompressed(pkBytes []byte) (PublicKey, error) } // prKeyECDSA is the private key of ECDSA, it implements the interface PrivateKey - -var _ PrivateKey = (*prKeyECDSA)(nil) - type prKeyECDSA struct { // the signature algo alg *ecdsaAlgo @@ -334,6 +331,8 @@ type prKeyECDSA struct { pubKey *pubKeyECDSA } +var _ PrivateKey = (*prKeyECDSA)(nil) + // Algorithm returns the algo related to the private key func (sk *prKeyECDSA) Algorithm() SigningAlgorithm { return sk.alg.algo @@ -395,9 +394,6 @@ func (sk *prKeyECDSA) String() string { } // pubKeyECDSA is the public key of ECDSA, it implements PublicKey - -var _ PublicKey = (*pubKeyECDSA)(nil) - type pubKeyECDSA struct { // the signature algo alg *ecdsaAlgo @@ -405,6 +401,8 @@ type pubKeyECDSA struct { goPubKey *ecdsa.PublicKey } +var _ PublicKey = (*pubKeyECDSA)(nil) + // Algorithm returns the the algo related to the private key func (pk *pubKeyECDSA) Algorithm() SigningAlgorithm { return pk.alg.algo From 627d682bfb2b9bd3b2064f7d450958aa8f86368d Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 10 Oct 2023 19:36:05 -0500 Subject: [PATCH 186/200] add E1 random point multiplication benchmark --- crypto/bls12381_utils.go | 6 +++--- crypto/bls12381_utils_test.go | 18 +++++++++++++++--- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 41937bc18c2..65a54bb9dd4 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -103,8 +103,8 @@ func (p *pointE2) String() string { return fmt.Sprintf("%#x", encoding) } -// Scalar multiplication of a generic point `p` in G1 -func (p *pointE1) scalarMultG1(res *pointE1, expo *scalar) { +// Scalar multiplication of a generic point `p` in E1 +func (p *pointE1) scalarMultE1(res *pointE1, expo *scalar) { C.E1_mult((*C.E1)(res), (*C.E1)(p), (*C.Fr)(expo)) } @@ -165,7 +165,7 @@ func randFr(x *scalar, rand random.Rand) bool { // and saves the random in `x`. func randFrStar(x *scalar, rand random.Rand) { isZero := true - // exteremely unlikely this loop runs more than once, + // extremely unlikely this loop runs more than once, // but force the output to be non-zero instead of propagating an error. 
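The retry loop mentioned above is plain rejection sampling: draw a uniform scalar and redraw in the negligible case where it is zero. A self-contained Go sketch of the same idea using crypto/rand and math/big (not the package's randFr/randFrStar, which draw from a caller-supplied PRG):

package main

import (
	"crypto/rand"
	"fmt"
	"math/big"
)

// randNonZeroMod returns a uniformly random scalar in [1, r-1] by rejection
// sampling; with a 255-bit r the loop repeats with probability about 2^-255.
func randNonZeroMod(r *big.Int) (*big.Int, error) {
	for {
		x, err := rand.Int(rand.Reader, r) // uniform in [0, r-1]
		if err != nil {
			return nil, err
		}
		if x.Sign() != 0 {
			return x, nil
		}
	}
}

func main() {
	// BLS12-381 group order r (well-known constant).
	r, _ := new(big.Int).SetString("73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001", 16)
	x, err := randNonZeroMod(r)
	fmt.Println(x.BitLen() <= 255, err)
}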
for isZero { isZero = randFr(x, rand) diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index ade31bbb6b9..e71702d7cbf 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -63,8 +63,8 @@ func BenchmarkScalarMult(b *testing.B) { // G1 generator multiplication // Note that generator and random point multiplications // are implemented with the same algorithm - b.Run("G1", func(b *testing.B) { - var res pointE1 + var res pointE1 + b.Run("G1 gen", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { generatorScalarMultG1(&res, &expo) @@ -72,10 +72,22 @@ func BenchmarkScalarMult(b *testing.B) { b.StopTimer() }) + // E1 random point multiplication + // Note that generator and random point multiplications + // are implemented with the same algorithm + b.Run("E1 rand", func(b *testing.B) { + var res pointE1 + b.ResetTimer() + for i := 0; i < b.N; i++ { + res.scalarMultE1(&res, &expo) + } + b.StopTimer() + }) + // G2 generator multiplication // Note that generator and random point multiplications // are implemented with the same algorithm - b.Run("G2", func(b *testing.B) { + b.Run("G2 gen", func(b *testing.B) { var res pointE2 b.ResetTimer() for i := 0; i < b.N; i++ { From cf8667b93c5d1978a196a3464a42530324573570 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 10 Oct 2023 19:40:00 -0500 Subject: [PATCH 187/200] remove StopTimer in bench --- crypto/bls12381_utils_test.go | 6 ------ 1 file changed, 6 deletions(-) diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index e71702d7cbf..257ec1afa1b 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -69,7 +69,6 @@ func BenchmarkScalarMult(b *testing.B) { for i := 0; i < b.N; i++ { generatorScalarMultG1(&res, &expo) } - b.StopTimer() }) // E1 random point multiplication @@ -81,7 +80,6 @@ func BenchmarkScalarMult(b *testing.B) { for i := 0; i < b.N; i++ { res.scalarMultE1(&res, &expo) } - b.StopTimer() }) // G2 generator multiplication @@ -93,7 +91,6 @@ func BenchmarkScalarMult(b *testing.B) { for i := 0; i < b.N; i++ { generatorScalarMultG2(&res, &expo) } - b.StopTimer() }) } @@ -145,7 +142,6 @@ func BenchmarkMapToG1(b *testing.B) { p = mapToG1(input) } require.NotNil(b, p) - b.StopTimer() } // test subgroup membership check in G1 and G2 @@ -187,7 +183,6 @@ func BenchmarkSubgroupCheck(b *testing.B) { for i := 0; i < b.N; i++ { _ = checkMembershipG1(&p) // G1 } - b.StopTimer() }) b.Run("G2", func(b *testing.B) { @@ -197,7 +192,6 @@ func BenchmarkSubgroupCheck(b *testing.B) { for i := 0; i < b.N; i++ { _ = checkMembershipG2(&p) // G2 } - b.StopTimer() }) } From 5f89c65df3ec9813c76d76644e478d5a96e0c256 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 10 Oct 2023 19:52:08 -0500 Subject: [PATCH 188/200] comment updates and reformat --- crypto/bls12381_utils_test.go | 2 +- crypto/bls_thresholdsign.go | 18 +++++++++--------- crypto/bls_thresholdsign_test.go | 2 -- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 257ec1afa1b..a528e240363 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -220,7 +220,7 @@ func TestReadWriteG1(t *testing.T) { t.Run("infinity", func(t *testing.T) { var p, q pointE1 seed := make([]byte, frBytesLen) - unsafeMapToG1(&p, seed) // this results in the infinity point + unsafeMapToG1(&p, seed) // this results in the infinity point given how `unsafeMapToG1` works with an empty scalar 
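In the Zcash serialization format referenced by this patch set, an encoding can be recognized as the identity (infinity) point from its header byte alone: assuming compressed serialization, the compression bit (0x80) and the infinity bit (0x40) are set and all remaining bytes are zero, matching the `(G1_SERIALIZATION << 7) | (1 << 6)` header written by E1_write_bytes further down. A sketch of that kind of check (isCompressedIdentity is hypothetical; the diff does not show the body of IsBLSSignatureIdentity):

package main

import "fmt"

const g1SerBytes = 48 // assumed compressed G1 encoding size

// isCompressedIdentity reports whether b is the compressed encoding of the
// G1 identity: header byte 0xc0 followed by zero bytes only.
func isCompressedIdentity(b []byte) bool {
	if len(b) != g1SerBytes || b[0] != 0xc0 {
		return false
	}
	for _, v := range b[1:] {
		if v != 0 {
			return false
		}
	}
	return true
}

func main() {
	id := make([]byte, g1SerBytes)
	id[0] = 0xc0
	fmt.Println(isCompressedIdentity(id)) // true
}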
writePointE1(bytes, &p) require.True(t, IsBLSSignatureIdentity(bytes)) // sanity check err := readPointE1(&q, bytes) diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index efe660570db..412f06f962a 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -33,8 +33,6 @@ import ( // blsThresholdSignatureParticipant implements ThresholdSignatureParticipant // based on the BLS signature scheme -var _ ThresholdSignatureParticipant = (*blsThresholdSignatureParticipant)(nil) - type blsThresholdSignatureParticipant struct { // embed the follower *blsThresholdSignatureInspector @@ -44,10 +42,10 @@ type blsThresholdSignatureParticipant struct { myPrivateKey PrivateKey } +var _ ThresholdSignatureParticipant = (*blsThresholdSignatureParticipant)(nil) + // blsThresholdSignatureInspector implements ThresholdSignatureInspector // based on the BLS signature scheme -var _ ThresholdSignatureInspector = (*blsThresholdSignatureInspector)(nil) - type blsThresholdSignatureInspector struct { // size of the group size int @@ -72,6 +70,8 @@ type blsThresholdSignatureInspector struct { lock sync.RWMutex } +var _ ThresholdSignatureInspector = (*blsThresholdSignatureInspector)(nil) + // NewBLSThresholdSignatureParticipant creates a new instance of Threshold signature Participant using BLS. // A participant is able to participate in a threshold signing protocol as well as following the // protocol. @@ -82,8 +82,8 @@ type blsThresholdSignatureInspector struct { // participant is indexed by `myIndex` and holds the input private key // where n is the length of the public key shares slice. // -// The function returns -// - (nil, invalidInputsError) if: +// The function returns: +// - (nil, invalidInputsError) if: // - n is not in [`ThresholdSignMinSize`, `ThresholdSignMaxSize`] // - threshold value is not in interval [1, n-1] // - input private key and public key at my index do not match @@ -138,8 +138,8 @@ func NewBLSThresholdSignatureParticipant( // Participants are defined by their public key share, and are indexed from 0 to n-1 // where n is the length of the public key shares slice. // -// The function returns -// - (nil, invalidInputsError) if: +// The function returns: +// - (nil, invalidInputsError) if: // - n is not in [`ThresholdSignMinSize`, `ThresholdSignMaxSize`] // - threshold value is not in interval [1, n-1] // - (nil, notBLSKeyError) at least one public key is not of type pubKeyBLSBLS12381 @@ -535,7 +535,7 @@ func EnoughShares(threshold int, sharesNumber int) (bool, error) { // BLSThresholdKeyGen is a key generation for a BLS-based // threshold signature scheme with a trusted dealer. 
// -// The function returns : +// The function returns: // - (nil, nil, nil, invalidInputsErrorf) if: // - seed is too short // - n is not in [`ThresholdSignMinSize`, `ThresholdSignMaxSize`] diff --git a/crypto/bls_thresholdsign_test.go b/crypto/bls_thresholdsign_test.go index 20d578db264..9f3f83cb387 100644 --- a/crypto/bls_thresholdsign_test.go +++ b/crypto/bls_thresholdsign_test.go @@ -618,7 +618,6 @@ func BenchmarkSimpleKeyGen(b *testing.B) { for i := 0; i < b.N; i++ { _, _, _, _ = BLSThresholdKeyGen(n, optimalThreshold(n), seed) } - b.StopTimer() } func BenchmarkSignatureReconstruction(b *testing.B) { @@ -647,5 +646,4 @@ func BenchmarkSignatureReconstruction(b *testing.B) { _, err := BLSReconstructThresholdSignature(n, threshold, signShares, signers) require.NoError(b, err) } - b.StopTimer() } From 0587bc6e72ae1ee226bfdcd065236c77c4e0a439 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 12 Oct 2023 13:40:17 -0500 Subject: [PATCH 189/200] fix non-freed memory in error case --- crypto/bls12381_utils.c | 11 ++++++----- crypto/bls_core.c | 17 ++++++++++------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 528f865cfdd..fc29046e47f 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -11,12 +11,12 @@ // make sure flow crypto types are consistent with BLST types void types_sanity(void) { - assert(sizeof(vec256) == sizeof(Fr)); + assert(sizeof(Fr) == sizeof(vec256)); assert(sizeof(Fp) == sizeof(vec384)); - assert(sizeof(vec384x) == sizeof(Fp2)); + assert(sizeof(Fp2) == sizeof(vec384x)); assert(sizeof(E1) == sizeof(POINTonE1)); assert(sizeof(E2) == sizeof(POINTonE2)); - assert(sizeof(vec384fp12) == sizeof(Fp12)); + assert(sizeof(Fp12) == sizeof(vec384fp12)); } // ------------------- Fr utilities @@ -556,9 +556,9 @@ ERROR E1_read_bytes(E1 *a, const byte *in, const int in_len) { // https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) void E1_write_bytes(byte *out, const E1 *a) { if (E1_is_infty(a)) { + memset(out, 0, G1_SER_BYTES); // set the infinity bit out[0] = (G1_SERIALIZATION << 7) | (1 << 6); - memset(out + 1, 0, G1_SER_BYTES - 1); return; } E1 tmp; @@ -620,8 +620,9 @@ int E1_sum_vector_byte(byte *out, const byte *in_bytes, const int in_len) { int n = in_len / G1_SER_BYTES; // number of signatures E1 *vec = (E1 *)malloc(n * sizeof(E1)); - if (!vec) + if (!vec) { goto mem_error; + } // import the points from the array for (int i = 0; i < n; i++) { diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 19d29f46713..65f510f5987 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -401,11 +401,18 @@ void bls_batch_verify(const int sigs_len, byte *results, const E2 *pks_input, // build the arrays of G1 and G2 elements to verify E2 *pks = (E2 *)malloc(sigs_len * sizeof(E2)); - if (!pks) + if (!pks) { return; + } E1 *sigs = (E1 *)malloc(sigs_len * sizeof(E1)); - if (!sigs) + if (!sigs) { goto out_sigs; + } + + E1 h; + if (map_to_G1(&h, data, data_len) != VALID) { + goto out; + } for (int i = 0; i < sigs_len; i++) { // convert the signature points: @@ -440,11 +447,7 @@ void bls_batch_verify(const int sigs_len, byte *results, const E2 *pks_input, } // build a binary tree of aggregations node *root = build_tree(sigs_len, &pks[0], &sigs[0]); - if (!root) - goto out; - - E1 h; - if (map_to_G1(&h, data, data_len) != VALID) { + if (!root) { goto out; } From f64d5ea4fdd97372e35cde00d37092734e2bb3f3 Mon Sep 17 00:00:00 2001 From: Tarak 
Ben Youssef Date: Fri, 20 Oct 2023 13:36:14 -0500 Subject: [PATCH 190/200] use a common logic to detect ADX support and set the crypto flag --- Makefile | 17 +---------------- crypto_adx_flag.mk | 17 +++++++++++++++++ insecure/Makefile | 17 +---------------- integration/Makefile | 17 +---------------- 4 files changed, 20 insertions(+), 48 deletions(-) create mode 100644 crypto_adx_flag.mk diff --git a/Makefile b/Makefile index 874d56f8d72..18415b12a16 100644 --- a/Makefile +++ b/Makefile @@ -39,23 +39,8 @@ K8S_YAMLS_LOCATION_STAGING=./k8s/staging export CONTAINER_REGISTRY := gcr.io/flow-container-registry export DOCKER_BUILDKIT := 1 -# `ADX_SUPPORT` is 1 if ADX instructions are supported and 0 otherwise. -ifeq ($(shell uname -s),Linux) -# detect ADX support on the CURRENT linux machine. - ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) -else -# on non-linux machines, set the flag to 1 by default - ADX_SUPPORT := 1 -endif +include crypto_adx_flag.mk -# the crypto package uses BLST source files underneath which may use ADX insructions. -ifeq ($(ADX_SUPPORT), 1) -# if ADX insructions are supported, default is to use a fast ADX BLST implementation - CRYPTO_FLAG := "" -else -# if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation - CRYPTO_FLAG := "-O -D__BLST_PORTABLE__" -endif CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) cmd/collection/collection: diff --git a/crypto_adx_flag.mk b/crypto_adx_flag.mk new file mode 100644 index 00000000000..22c405ab45d --- /dev/null +++ b/crypto_adx_flag.mk @@ -0,0 +1,17 @@ +# `ADX_SUPPORT` is 1 if ADX instructions are supported and 0 otherwise. +ifeq ($(shell uname -s),Linux) +# detect ADX support on the CURRENT linux machine. + ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) +else +# on non-linux machines, set the flag to 1 by default + ADX_SUPPORT := 1 +endif + +# the crypto package uses BLST source files underneath which may use ADX insructions. +ifeq ($(ADX_SUPPORT), 1) +# if ADX insructions are supported, default is to use a fast ADX BLST implementation + CRYPTO_FLAG := "" +else +# if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation + CRYPTO_FLAG := "-O -D__BLST_PORTABLE__" +endif \ No newline at end of file diff --git a/insecure/Makefile b/insecure/Makefile index fd6fdae0dd9..d1dc33fa216 100644 --- a/insecure/Makefile +++ b/insecure/Makefile @@ -8,23 +8,8 @@ else RACE_FLAG := endif -# `ADX_SUPPORT` is 1 if ADX instructions are supported and 0 otherwise. -ifeq ($(shell uname -s),Linux) -# detect ADX support on the CURRENT linux machine. - ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) -else -# on non-linux machines, set the flag to 1 by default - ADX_SUPPORT := 1 -endif +include ../crypto_adx_flag.mk -# the crypto package uses BLST source files underneath which may use ADX insructions. 
-ifeq ($(ADX_SUPPORT), 1) -# if ADX insructions are supported, default is to use a fast ADX BLST implementation - CRYPTO_FLAG := "" -else -# if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation - CRYPTO_FLAG := "-O -D__BLST_PORTABLE__" -endif CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) # runs all unit tests of the insecure module diff --git a/integration/Makefile b/integration/Makefile index 963b7093511..b29e5bcf873 100644 --- a/integration/Makefile +++ b/integration/Makefile @@ -8,23 +8,8 @@ else RACE_FLAG := endif -# `ADX_SUPPORT` is 1 if ADX instructions are supported and 0 otherwise. -ifeq ($(shell uname -s),Linux) -# detect ADX support on the CURRENT linux machine. - ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) -else -# on non-linux machines, set the flag to 1 by default - ADX_SUPPORT := 1 -endif +include ../crypto_adx_flag.mk -# the crypto package uses BLST source files underneath which may use ADX insructions. -ifeq ($(ADX_SUPPORT), 1) -# if ADX insructions are supported, default is to use a fast ADX BLST implementation - CRYPTO_FLAG := "" -else -# if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation - CRYPTO_FLAG := "-O -D__BLST_PORTABLE__" -endif CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) # Run the integration test suite From 6c34ae3fccb9a23887c95d790e141a7c7000ed3f Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 20 Oct 2023 14:01:32 -0500 Subject: [PATCH 191/200] add noop target for empty ci operations --- .github/workflows/ci.yml | 2 +- .github/workflows/flaky-test-monitor.yml | 2 +- Makefile | 5 +++++ crypto/blst_src/README.md | 2 +- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b24de2f44ca..9a88caa0e93 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -135,7 +135,7 @@ jobs: matrix: include: - name: crypto - setup: + setup: noop retries: 1 race: 1 - name: insecure diff --git a/.github/workflows/flaky-test-monitor.yml b/.github/workflows/flaky-test-monitor.yml index aa9d99dd65b..836af9c9228 100644 --- a/.github/workflows/flaky-test-monitor.yml +++ b/.github/workflows/flaky-test-monitor.yml @@ -83,7 +83,7 @@ jobs: matrix: include: - name: crypto - setup: + setup: noop race: 1 test_category: unit-crypto - name: insecure diff --git a/Makefile b/Makefile index 18415b12a16..3dd74fac0af 100644 --- a/Makefile +++ b/Makefile @@ -43,6 +43,11 @@ include crypto_adx_flag.mk CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) +# needed for CI +.PHONY: noop +noop: + @echo "This is a no-op target" + cmd/collection/collection: $(CGO_FLAG) go build -o cmd/collection/collection cmd/collection/main.go diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md index 50ca45ea7d6..c2e89a1de71 100644 --- a/crypto/blst_src/README.md +++ b/crypto/blst_src/README.md @@ -28,4 +28,4 @@ To upgrade the BLST version: - [ ] solve all breaking changes that may occur. - [ ] update the commit version on this `./blst_src/README`. -Note that Flow crypto is using non exported internal functions from BLST. Checking for interfaces breaking changes in BLST should done along with auditing changes between the old and new versions. This includes checking logical changes and assumptions beyond interfaces, and assessing their security and performance impact on protocols implemented in Flow crypto. +Note that Flow crypto is using non exported internal functions from BLST. 
Checking for interfaces breaking changes in BLST should be done along with auditing changes between the old and new versions. This includes checking logical changes and assumptions beyond interfaces, and assessing their security and performance impact on protocols implemented in Flow crypto. From b180269bb1f52da3d60bb6e4a1114262f9807860 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 20 Oct 2023 14:05:57 -0500 Subject: [PATCH 192/200] cleaning and typos --- Makefile | 5 ----- crypto/Makefile | 11 +++-------- crypto_adx_flag.mk | 6 +++--- integration/Makefile | 6 +----- 4 files changed, 7 insertions(+), 21 deletions(-) diff --git a/Makefile b/Makefile index 3dd74fac0af..76e23178cb5 100644 --- a/Makefile +++ b/Makefile @@ -58,9 +58,6 @@ cmd/util/util: update-core-contracts-version: ./scripts/update-core-contracts.sh $(CC_VERSION) -############################################################################################ -# CAUTION: DO NOT MODIFY THESE TARGETS! DOING SO WILL BREAK THE FLAKY TEST MONITOR - .PHONY: unittest-main unittest-main: # test all packages @@ -84,8 +81,6 @@ install-tools: check-go-version install-mock-generators verify-mocks: tidy generate-mocks git diff --exit-code -############################################################################################ - .SILENT: go-math-rand-check go-math-rand-check: # check that the insecure math/rand Go package isn't used by production code. diff --git a/crypto/Makefile b/crypto/Makefile index 43aae8ef39f..14016e40619 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -22,12 +22,12 @@ else ADX_SUPPORT := 1 endif -# the crypto package uses BLST source files underneath which may use ADX insructions. +# the crypto package uses BLST source files underneath which may use ADX instructions. ifeq ($(ADX_SUPPORT), 1) -# if ADX insructions are supported, default is to use a fast ADX BLST implementation +# if ADX instructions are supported, default is to use a fast ADX BLST implementation CRYPTO_FLAG := "" else -# if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation +# if ADX instructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation CRYPTO_FLAG := "-O -D__BLST_PORTABLE__" endif CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) @@ -75,8 +75,6 @@ c-sanitize: c-asan # - address sanitization and other checks (only on linux) # - memory sanitization (target m-san) is disabled because of multiple false positives - - # Go tidy .PHONY: go-tidy go-tidy: @@ -90,9 +88,6 @@ lint: go-tidy # revive -config revive.toml golangci-lint run -v ./... - - - # test all packages .PHONY: test test: diff --git a/crypto_adx_flag.mk b/crypto_adx_flag.mk index 22c405ab45d..667c8d493d3 100644 --- a/crypto_adx_flag.mk +++ b/crypto_adx_flag.mk @@ -7,11 +7,11 @@ else ADX_SUPPORT := 1 endif -# the crypto package uses BLST source files underneath which may use ADX insructions. +# the crypto package uses BLST source files underneath which may use ADX instructions. 
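The included crypto_adx_flag.mk detects ADX at build time by grepping /proc/cpuinfo on Linux and defaults ADX_SUPPORT to 1 elsewhere. The same capability can also be probed at run time from Go; a sketch assuming golang.org/x/sys/cpu and its X86.HasADX flag (that package is not part of this patch set):

package main

import (
	"fmt"
	"runtime"

	"golang.org/x/sys/cpu"
)

func main() {
	// cpu.X86.HasADX is only meaningful on x86; note the Makefile instead
	// defaults ADX_SUPPORT to 1 on non-Linux hosts, where /proc/cpuinfo is
	// unavailable.
	if runtime.GOARCH == "amd64" && cpu.X86.HasADX {
		fmt.Println("ADX available: default (fast) BLST build is fine")
	} else {
		fmt.Println(`no ADX detected: build with CGO_CFLAGS="-O -D__BLST_PORTABLE__"`)
	}
}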
ifeq ($(ADX_SUPPORT), 1) -# if ADX insructions are supported, default is to use a fast ADX BLST implementation +# if ADX instructions are supported, default is to use a fast ADX BLST implementation CRYPTO_FLAG := "" else -# if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation +# if ADX instructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation CRYPTO_FLAG := "-O -D__BLST_PORTABLE__" endif \ No newline at end of file diff --git a/integration/Makefile b/integration/Makefile index b29e5bcf873..1e73769a50f 100644 --- a/integration/Makefile +++ b/integration/Makefile @@ -19,10 +19,6 @@ integration-test: access-tests ghost-tests mvp-tests execution-tests verificatio .PHONY: ci-integration-test ci-integration-test: access-tests ghost-tests mvp-tests epochs-cohort1-tests epochs-cohort2-tests consensus-tests execution-tests verification-tests upgrades-tests network-tests collection-tests -############################################################################################ -# CAUTION: DO NOT MODIFY THE TARGETS BELOW! DOING SO WILL BREAK THE FLAKY TEST MONITOR -# In particular, do not skip tests by commenting them out here. - # Run unit tests for test utilities in this module .PHONY: test test: @@ -88,4 +84,4 @@ bft-gossipsub-tests: .PHONY: bft-tests bft-tests: bft-framework-tests bft-protocol-tests bft-gossipsub-tests -############################################################################################ + From 7ad6a7a5e93154fb051a98a5f704a3027ddefc57 Mon Sep 17 00:00:00 2001 From: Jordan Schalm Date: Mon, 23 Oct 2023 09:38:28 -0700 Subject: [PATCH 193/200] make tidy --- go.sum | 9 +-------- insecure/go.sum | 6 +----- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/go.sum b/go.sum index 8feff29055b..3dfa262c25d 100644 --- a/go.sum +++ b/go.sum @@ -1323,13 +1323,8 @@ github.com/onflow/flow-core-contracts/lib/go/templates v0.14.0/go.mod h1:ZeLxwaB github.com/onflow/flow-ft/lib/go/contracts v0.7.1-0.20230711213910-baad011d2b13 h1:B4ll7e3j+MqTJv2122Enq3RtDNzmIGRu9xjV7fo7un0= github.com/onflow/flow-ft/lib/go/contracts v0.7.1-0.20230711213910-baad011d2b13/go.mod h1:kTMFIySzEJJeupk+7EmXs0EJ6CBWY/MV9fv9iYQk+RU= github.com/onflow/flow-go-sdk v0.24.0/go.mod h1:IoptMLPyFXWvyd9yYA6/4EmSeeozl6nJoIv4FaEMg74= -github.com/onflow/flow-go-sdk v0.41.10 h1:Cio6GJhtx532TUY+cqrqWglD5sZCXkWeM5QvaRha3p4= -github.com/onflow/flow-go-sdk v0.41.10/go.mod h1:0a0LiQFbFt8RW/ptoMUU7YkvW9ArVcbjLE0XS78uz1E= github.com/onflow/flow-go-sdk v0.41.9 h1:cyplhhhc0RnfOAan2t7I/7C9g1hVGDDLUhWj6ZHAkk4= github.com/onflow/flow-go-sdk v0.41.9/go.mod h1:e9Q5TITCy7g08lkdQJxP8fAKBnBoC5FjALvUKr36j4I= -github.com/onflow/flow-go/crypto v0.21.3/go.mod h1:vI6V4CY3R6c4JKBxdcRiR/AnjBfL8OSD97bJc60cLuQ= -github.com/onflow/flow-go/crypto v0.24.9 h1:0EQp+kSZYJepMIiSypfJVe7tzsPcb6UXOdOtsTCDhBs= -github.com/onflow/flow-go/crypto v0.24.9/go.mod h1:fqCzkIBBMRRkciVrvW21rECKq1oD7Q6u+bCI78lfNX0= github.com/onflow/flow-nft/lib/go/contracts v1.1.0 h1:rhUDeD27jhLwOqQKI/23008CYfnqXErrJvc4EFRP2a0= github.com/onflow/flow-nft/lib/go/contracts v1.1.0/go.mod h1:YsvzYng4htDgRB9sa9jxdwoTuuhjK8WYWXTyLkIigZY= github.com/onflow/flow/protobuf/go/flow v0.2.2/go.mod h1:gQxYqCfkI8lpnKsmIjwtN2mV/N2PIwc1I+RUK4HPIc8= @@ -1775,10 +1770,8 @@ golang.org/x/crypto v0.0.0-20210513164829-c07d793c2f9a/go.mod h1:P+XmwS30IXTQdn5 golang.org/x/crypto v0.0.0-20210817164053-32db794688a5/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod 
h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20211108221036-ceb1ce70b4fa/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= -golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM= -golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I= golang.org/x/crypto v0.0.0-20220314234659-1baeb1ce4c0b/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= +golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.11.0 h1:6Ewdq3tDic1mg5xRO4milcWCfMVQhI4NkqWWvqejpuA= golang.org/x/crypto v0.11.0/go.mod h1:xgJhtzW8F9jGdVFWZESrid1U1bjeNy4zgy5cRr/CIio= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= diff --git a/insecure/go.sum b/insecure/go.sum index a3258e00f67..d918d8fc6c0 100644 --- a/insecure/go.sum +++ b/insecure/go.sum @@ -1744,10 +1744,8 @@ golang.org/x/crypto v0.0.0-20210513164829-c07d793c2f9a/go.mod h1:P+XmwS30IXTQdn5 golang.org/x/crypto v0.0.0-20210817164053-32db794688a5/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20211108221036-ceb1ce70b4fa/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= -golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM= -golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I= golang.org/x/crypto v0.0.0-20220314234659-1baeb1ce4c0b/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= +golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.11.0 h1:6Ewdq3tDic1mg5xRO4milcWCfMVQhI4NkqWWvqejpuA= golang.org/x/crypto v0.11.0/go.mod h1:xgJhtzW8F9jGdVFWZESrid1U1bjeNy4zgy5cRr/CIio= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -1856,8 +1854,6 @@ golang.org/x/net v0.0.0-20210423184538-5f58ad60dda6/go.mod h1:OJAsFXCWl8Ukc7SiCT golang.org/x/net v0.0.0-20210503060351-7fd8e65b6420/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20210805182204-aaa1db679c0d/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= -golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M= -golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.12.0 h1:cfawfvKITfUsFCeJIHJrbSxpeu/E81khclypR0GVT50= golang.org/x/net v0.12.0/go.mod h1:zEVYFnQC7m/vmpQFELhcD1EWkZlX69l4oqgmer6hfKA= From f45e54690754564a4dfaa64548650cf52b9fdfda Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 23 Oct 2023 16:58:29 -0500 Subject: [PATCH 194/200] fix overwritten test settings --- .../tests/epochs/cohort2/epoch_join_and_leave_vn_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration/tests/epochs/cohort2/epoch_join_and_leave_vn_test.go b/integration/tests/epochs/cohort2/epoch_join_and_leave_vn_test.go index 
65569dacd08..a6612ced27c 100644 --- a/integration/tests/epochs/cohort2/epoch_join_and_leave_vn_test.go +++ b/integration/tests/epochs/cohort2/epoch_join_and_leave_vn_test.go @@ -28,7 +28,7 @@ func (s *EpochJoinAndLeaveVNSuite) SetupTest() { s.DKGPhaseLen = 100 s.EpochLen = 450 s.EpochCommitSafetyThreshold = 20 - s.DynamicEpochTransitionSuite.SetupTest() + s.Suite.SetupTest() } // TestEpochJoinAndLeaveVN should update verification nodes and assert healthy network conditions From 99b1237305d2240fe86aaee700d64ca0ff829fc6 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 14 Nov 2023 20:14:19 -0500 Subject: [PATCH 195/200] remove crypto setup --- .github/workflows/ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5a9e450c264..9b7e7b8fdaf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -190,8 +190,6 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Build relic - run: make crypto_setup_gopath - name: Docker build run: make docker-build-flow docker-build-flow-corrupt - name: Save Docker images From fbfecc1d0a1d321c67a6a79dea2bc841c57a9788 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 21 Nov 2023 17:17:21 -0500 Subject: [PATCH 196/200] update graceful stop duration to 1s in integration upgrade test --- integration/tests/upgrades/suite.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/integration/tests/upgrades/suite.go b/integration/tests/upgrades/suite.go index dbc40e810aa..93094b8c13b 100644 --- a/integration/tests/upgrades/suite.go +++ b/integration/tests/upgrades/suite.go @@ -83,10 +83,12 @@ func (s *Suite) SetupTest() { testnet.WithLogLevel(zerolog.WarnLevel), testnet.WithID(s.exe1ID), testnet.WithAdditionalFlag("--extensive-logging=true"), + testnet.WithAdditionalFlag("--max-graceful-stop-duration=1s"), ), testnet.NewNodeConfig( flow.RoleExecution, testnet.WithLogLevel(zerolog.WarnLevel), + testnet.WithAdditionalFlag("--max-graceful-stop-duration=1s"), ), testnet.NewNodeConfig(flow.RoleConsensus, consensusConfigs...), testnet.NewNodeConfig(flow.RoleConsensus, consensusConfigs...), From cd2d74e8b6b0d40b6805156943d678bb8db15f72 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 24 Nov 2023 16:56:40 -0500 Subject: [PATCH 197/200] slow down block rate in integration access cohort1 test --- integration/tests/access/cohort1/access_api_test.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/integration/tests/access/cohort1/access_api_test.go b/integration/tests/access/cohort1/access_api_test.go index cb5a175130d..24409f84ad2 100644 --- a/integration/tests/access/cohort1/access_api_test.go +++ b/integration/tests/access/cohort1/access_api_test.go @@ -87,7 +87,12 @@ func (s *AccessAPISuite) SetupTest() { ) consensusConfigs := []func(config *testnet.NodeConfig){ - testnet.WithAdditionalFlag("--cruise-ctl-fallback-proposal-duration=100ms"), + // `cruise-ctl-fallback-proposal-duration` is set to 250ms instead of 100ms + // to purposely slow down the block rate. This is needed since the crypto module + // update provides faster BLS operations.
+ // TODO: fix the access integration test logic to function without slowing down + // the block rate + testnet.WithAdditionalFlag("--cruise-ctl-fallback-proposal-duration=250ms"), testnet.WithAdditionalFlagf("--required-verification-seal-approvals=%d", 1), testnet.WithAdditionalFlagf("--required-construction-seal-approvals=%d", 1), testnet.WithLogLevel(zerolog.FatalLevel), From 769ad6727c2bc2e4963cb850e78f643286218030 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 5 Dec 2023 16:14:48 -0600 Subject: [PATCH 198/200] slow down block production in bft tests --- integration/tests/bft/base_suite.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/integration/tests/bft/base_suite.go b/integration/tests/bft/base_suite.go index b50085a9e50..2e6e74de881 100644 --- a/integration/tests/bft/base_suite.go +++ b/integration/tests/bft/base_suite.go @@ -77,7 +77,12 @@ func (b *BaseSuite) SetupSuite() { testnet.WithLogLevel(zerolog.FatalLevel), testnet.WithAdditionalFlag("--required-verification-seal-approvals=1"), testnet.WithAdditionalFlag("--required-construction-seal-approvals=1"), - testnet.WithAdditionalFlag("--cruise-ctl-fallback-proposal-duration=1ms"), + // `cruise-ctl-fallback-proposal-duration` is set to 250ms instead of 1ms + // to purposely slow down the block rate. This is needed since the crypto module + // update provides faster BLS operations. + // TODO: fix the access integration test logic to function without slowing down + // the block rate + testnet.WithAdditionalFlag("--cruise-ctl-fallback-proposal-duration=250ms"), ) b.NodeConfigs = append(b.NodeConfigs, nodeConfig) } From f7dac6cff619f3f41bdd1679205ff50aa378d80a Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 5 Dec 2023 16:42:06 -0600 Subject: [PATCH 199/200] slow down block rate in SN test --- .../epochs/cohort2/epoch_join_and_leave_sn_test.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/integration/tests/epochs/cohort2/epoch_join_and_leave_sn_test.go b/integration/tests/epochs/cohort2/epoch_join_and_leave_sn_test.go index fb825e447a6..2073e693988 100644 --- a/integration/tests/epochs/cohort2/epoch_join_and_leave_sn_test.go +++ b/integration/tests/epochs/cohort2/epoch_join_and_leave_sn_test.go @@ -2,6 +2,7 @@ package cohort2 import ( "testing" + "time" "github.com/stretchr/testify/suite" @@ -17,6 +18,15 @@ type EpochJoinAndLeaveSNSuite struct { epochs.DynamicEpochTransitionSuite } +func (s *EpochJoinAndLeaveSNSuite) SetupTest() { + // slow down the block rate. This is needed since the crypto module + // update provides faster BLS operations. + // TODO: fix the access integration test logic to function without slowing down + // the block rate + s.ConsensusProposalDuration = time.Millisecond * 250 + s.Suite.SetupTest() +} + // TestEpochJoinAndLeaveSN should update consensus nodes and assert healthy network conditions // after the epoch transition completes. See health check function for details.
func (s *EpochJoinAndLeaveSNSuite) TestEpochJoinAndLeaveSN() { From e3e29f049b1bb3409d9d5dfc469a4e12c93f75fe Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 5 Dec 2023 17:15:37 -0600 Subject: [PATCH 200/200] fix SN integration test bug --- .../tests/epochs/cohort2/epoch_join_and_leave_sn_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration/tests/epochs/cohort2/epoch_join_and_leave_sn_test.go b/integration/tests/epochs/cohort2/epoch_join_and_leave_sn_test.go index 2073e693988..d101af6371d 100644 --- a/integration/tests/epochs/cohort2/epoch_join_and_leave_sn_test.go +++ b/integration/tests/epochs/cohort2/epoch_join_and_leave_sn_test.go @@ -24,7 +24,7 @@ func (s *EpochJoinAndLeaveSNSuite) SetupTest() { // TODO: fix the access integration test logic to function without slowing down // the block rate s.ConsensusProposalDuration = time.Millisecond * 250 - s.Suite.SetupTest() + s.DynamicEpochTransitionSuite.SetupTest() } // TestEpochJoinAndLeaveSN should update consensus nodes and assert healthy network conditions